1. Add cov_options so that group_correction and df_correction can each be toggled on/off independently for clustered covariance

msukiasyan · msukiasyan · commit 7576ff791c3a · 2025-10-02T14:12:14.000-07:00
2. Set both corrections on by default
3. Add a new test for the correction factors and update the existing tests to match the new correction defaults
diff --git a/econml/inference/_inference.py b/econml/inference/_inference.py
@@ -465,22 +465,28 @@ class StatsModelsInference(LinearModelFinalInference):
     ----------
     cov_type : str, default 'HC1'
         The type of covariance estimation method to use.  Supported values are 'nonrobust',
-        'HC0', 'HC1'.
+        'HC0', 'HC1', 'clustered'.
+    cov_options : dict, optional
+        Additional options for covariance estimation. For clustered covariance, supports:
+        - 'group_correction': bool, default True. Whether to apply N_G/(N_G-1) correction.
+        - 'df_correction': bool, default True. Whether to apply (N-1)/(N-K) correction.
     """
 
-    def __init__(self, cov_type='HC1'):
-        if cov_type not in ['nonrobust', 'HC0', 'HC1']:
+    def __init__(self, cov_type='HC1', cov_options=None):
+        if cov_type not in ['nonrobust', 'HC0', 'HC1', 'clustered']:
             raise ValueError("Unsupported cov_type; "
                              "must be one of 'nonrobust', "
-                             "'HC0', 'HC1'")
+                             "'HC0', 'HC1', 'clustered'")
 
         self.cov_type = cov_type
+        self.cov_options = cov_options if cov_options is not None else {}
 
     def prefit(self, estimator, *args, **kwargs):
         super().prefit(estimator, *args, **kwargs)
         assert not (self.model_final.fit_intercept), ("Inference can only be performed on models linear in "
                                                       "their features, but here fit_intercept is True")
         self.model_final.cov_type = self.cov_type
+        self.model_final.cov_options = self.cov_options
 
 
 class GenericModelFinalInferenceDiscrete(Inference):
@@ -660,21 +666,27 @@ class StatsModelsInferenceDiscrete(LinearModelFinalInferenceDiscrete):
     ----------
     cov_type : str, default 'HC1'
         The type of covariance estimation method to use.  Supported values are 'nonrobust',
-        'HC0', 'HC1'.
+        'HC0', 'HC1', 'clustered'.
+    cov_options : dict, optional
+        Additional options for covariance estimation. For clustered covariance, supports:
+        - 'group_correction': bool, default True. Whether to apply N_G/(N_G-1) correction.
+        - 'df_correction': bool, default True. Whether to apply (N-1)/(N-K) correction.
     """
 
-    def __init__(self, cov_type='HC1'):
-        if cov_type not in ['nonrobust', 'HC0', 'HC1']:
+    def __init__(self, cov_type='HC1', cov_options=None):
+        if cov_type not in ['nonrobust', 'HC0', 'HC1', 'clustered']:
             raise ValueError("Unsupported cov_type; "
                              "must be one of 'nonrobust', "
-                             "'HC0', 'HC1'")
+                             "'HC0', 'HC1', 'clustered'")
 
         self.cov_type = cov_type
+        self.cov_options = cov_options if cov_options is not None else {}
 
     def prefit(self, estimator, *args, **kwargs):
         super().prefit(estimator, *args, **kwargs)
         # need to set the fit args before the estimator is fit
         self.model_final.cov_type = self.cov_type
+        self.model_final.cov_options = self.cov_options
 
 
 class InferenceResults(metaclass=abc.ABCMeta):
diff --git a/econml/iv/dml/_dml.py b/econml/iv/dml/_dml.py
@@ -377,7 +377,8 @@ def __init__(self, *,
                  mc_agg='mean',
                  random_state=None,
                  allow_missing=False,
-                 cov_type="HC0"):
+                 cov_type="HC0",
+                 cov_options=None):
         self.model_y_xw = clone(model_y_xw, safe=False)
         self.model_t_xw = clone(model_t_xw, safe=False)
         self.model_t_xwz = clone(model_t_xwz, safe=False)
@@ -386,6 +387,7 @@ def __init__(self, *,
         self.featurizer = clone(featurizer, safe=False)
         self.fit_cate_intercept = fit_cate_intercept
         self.cov_type = cov_type
+        self.cov_options = cov_options if cov_options is not None else {}
 
         super().__init__(discrete_outcome=discrete_outcome,
                          discrete_instrument=discrete_instrument,
@@ -405,7 +407,7 @@ def _gen_featurizer(self):
         return clone(self.featurizer, safe=False)
 
     def _gen_model_final(self):
-        return StatsModels2SLS(cov_type=self.cov_type)
+        return StatsModels2SLS(cov_type=self.cov_type, cov_options=self.cov_options)
 
     def _gen_ortho_learner_model_final(self):
         return _OrthoIVModelFinal(self._gen_model_final(), self._gen_featurizer(), self.fit_cate_intercept)
diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py
@@ -1694,13 +1694,21 @@ class StatsModelsLinearRegression(_StatsModelsWrapper):
         Whether to fit an intercept in this model
     cov_type : string, default "HC0"
         The covariance approach to use.  Supported values are "HC0", "HC1", "nonrobust", and "clustered".
+    cov_options : dict, optional
+        Additional options for covariance estimation. For clustered covariance, supports:
+        - 'group_correction': bool, default True. Whether to apply N_G/(N_G-1) correction.
+        - 'df_correction': bool, default True. Whether to apply (N-1)/(N-K) correction.
     enable_federation : bool, default False
         Whether to enable federation (aggregating this model's results with other models in a distributed setting).
         This requires additional memory proportional to the number of columns in X to the fourth power.
     """
 
-    def __init__(self, fit_intercept=True, cov_type="HC0", *, enable_federation=False):
+    def __init__(self, fit_intercept=True, cov_type="HC0", cov_options=None, *, enable_federation=False):
         self.cov_type = cov_type
+        self.cov_options = cov_options if cov_options is not None else {}
+        if cov_type == 'clustered':
+            self.cov_options.setdefault('group_correction', True)
+            self.cov_options.setdefault('df_correction', True)
         self.fit_intercept = fit_intercept
         self.enable_federation = enable_federation
 
@@ -2050,8 +2058,10 @@ def _compute_clustered_variance_linear(self, WX, eps_i, sigma_inv, groups):
         group_ids, inverse_idx = np.unique(groups, return_inverse=True)
         n_groups = len(group_ids)
 
-        # Group correction factor
-        group_correction = (n_groups / (n_groups - 1))
+        # Apply correction factors based on cov_options
+        group_correction = (n_groups / (n_groups - 1)) if self.cov_options['group_correction'] else 1.0
+        df_correction = ((n - 1) / (n - k)) if self.cov_options['df_correction'] else 1.0
+        correction = group_correction * df_correction
 
         if eps_i.ndim < 2:
             # Single outcome case
@@ -2060,7 +2070,7 @@ def _compute_clustered_variance_linear(self, WX, eps_i, sigma_inv, groups):
             np.add.at(group_sums, inverse_idx, WX_e)
             s = group_sums.T @ group_sums
 
-            return group_correction * np.matmul(sigma_inv, np.matmul(s, sigma_inv))
+            return correction * np.matmul(sigma_inv, np.matmul(s, sigma_inv))
         else:
             # Multiple outcome case
             var_list = []
@@ -2070,7 +2080,7 @@ def _compute_clustered_variance_linear(self, WX, eps_i, sigma_inv, groups):
                 np.add.at(group_sums, inverse_idx, WX_e)
                 s = group_sums.T @ group_sums
 
-                var_list.append(group_correction * np.matmul(sigma_inv, np.matmul(s, sigma_inv)))
+                var_list.append(correction * np.matmul(sigma_inv, np.matmul(s, sigma_inv)))
 
             return var_list
 
@@ -2162,11 +2172,19 @@ class StatsModels2SLS(_StatsModelsWrapper):
     ----------
     cov_type : {'HC0', 'HC1', 'nonrobust', 'clustered', or None}, default 'HC0'
         Indicates how the covariance matrix is estimated. 'clustered' requires groups to be provided in fit().
+    cov_options : dict, optional
+        Additional options for covariance estimation. For clustered covariance, supports:
+        - 'group_correction': bool, default True. Whether to apply N_G/(N_G-1) correction.
+        - 'df_correction': bool, default True. Whether to apply (N-1)/(N-K) correction.
     """
 
-    def __init__(self, cov_type="HC0"):
+    def __init__(self, cov_type="HC0", cov_options=None):
         self.fit_intercept = False
         self.cov_type = cov_type
+        self.cov_options = cov_options if cov_options is not None else {}
+        if cov_type == 'clustered':
+            self.cov_options.setdefault('group_correction', True)
+            self.cov_options.setdefault('df_correction', True)
         return
 
     def _check_input(self, Z, T, y, sample_weight, groups=None):
@@ -2322,8 +2340,10 @@ def _compute_clustered_variance(self, that, eps_i, thatT_that_inv, groups):
         group_ids, inverse_idx = np.unique(groups, return_inverse=True)
         n_groups = len(group_ids)
 
-        # Group correction factor
-        group_correction = (n_groups / (n_groups - 1))
+        # Apply correction factors based on cov_options
+        group_correction = (n_groups / (n_groups - 1)) if self.cov_options['group_correction'] else 1.0
+        df_correction = ((n - 1) / (n - k)) if self.cov_options['df_correction'] else 1.0
+        correction = group_correction * df_correction
 
         if eps_i.ndim < 2:
             # Single outcome case
@@ -2332,7 +2352,7 @@ def _compute_clustered_variance(self, that, eps_i, thatT_that_inv, groups):
             np.add.at(group_sums, inverse_idx, that_e)
             s = group_sums.T @ group_sums
 
-            return group_correction * np.matmul(thatT_that_inv, np.matmul(s, thatT_that_inv))
+            return correction * np.matmul(thatT_that_inv, np.matmul(s, thatT_that_inv))
         else:
             # Multiple outcome case
             var_list = []
@@ -2342,6 +2362,6 @@ def _compute_clustered_variance(self, that, eps_i, thatT_that_inv, groups):
                 np.add.at(group_sums, inverse_idx, that_e)
                 s = group_sums.T @ group_sums
 
-                var_list.append(group_correction * np.matmul(thatT_that_inv, np.matmul(s, thatT_that_inv)))
+                var_list.append(correction * np.matmul(thatT_that_inv, np.matmul(s, thatT_that_inv)))
 
             return var_list
diff --git a/econml/tests/test_clustered_se.py b/econml/tests/test_clustered_se.py
@@ -110,7 +110,7 @@ def test_clustered_se_without_groups_defaults_to_individual(self):
         T = np.random.binomial(1, 0.5, n)
         Y = np.random.normal(0, 1, n)
 
-        # Clustered SE without groups (defaults to individual groups)
+        # Clustered SE without groups (one cluster per observation), both corrections on by default
         np.random.seed(123)
         est_clustered = DML(model_y=LassoCV(), model_t=LogisticRegression(),
                            model_final=StatsModelsLinearRegression(fit_intercept=False, cov_type='clustered'),
@@ -129,16 +129,13 @@ def test_clustered_se_without_groups_defaults_to_individual(self):
         lb_clustered, ub_clustered = est_clustered.effect_interval(X_test, alpha=0.05)
         lb_hc0, ub_hc0 = est_hc0.effect_interval(X_test, alpha=0.05)
 
-        # Clustered SE should be HC0 SE * sqrt(n/(n-1)) when each obs is its own cluster
-        # Width of confidence intervals should differ by the adjustment factor
-        width_clustered = ub_clustered - lb_clustered
-        width_hc0 = ub_hc0 - lb_hc0
-
-        # When each observation is its own cluster, clustered SE should equal HC0 * sqrt(n/(n-1))
-        # due to the finite sample correction factor
-        correction_factor = np.sqrt(n / (n - 1))
-        expected_width = width_hc0 * correction_factor
-        np.testing.assert_allclose(width_clustered, expected_width, rtol=1e-10)
+        # With both corrections: sqrt(n/(n-1)) * sqrt((n-1)/(n-k)) = sqrt(n/(n-k))
+        # Get k from the fitted model (includes treatment variable)
+        k_params = est_clustered.model_final_.coef_.shape[0]
+        correction_factor = np.sqrt(n / (n - k_params))
+        expected_width = (ub_hc0 - lb_hc0) * correction_factor
+        actual_width = ub_clustered - lb_clustered
+        np.testing.assert_allclose(actual_width, expected_width, rtol=1e-10)
 
         # Test basic functionality still works
         effects = est_clustered.effect(X_test)
@@ -169,15 +166,11 @@ def test_clustered_se_matches_statsmodels(self):
         sm_model = sm.OLS(Y, X_with_intercept).fit(cov_type='cluster', cov_kwds={'groups': groups})
         sm_se = sm_model.bse[1]  # SE for X[:, 0] coefficient
 
-        # Account for statsmodels' additional n/(n-k) adjustment
-        k = X_with_intercept.shape[1]  # Number of parameters
-        sm_adjustment = np.sqrt((n - 1) / (n - k))
-        adjusted_sm_se = sm_se / sm_adjustment
-
-        # Should match very closely
-        relative_diff = abs(econml_se - adjusted_sm_se) / adjusted_sm_se
+        # Statsmodels applies both G/(G-1) and (N-1)/(N-K) corrections by default
+        # Our implementation also applies both by default, so they should match
+        relative_diff = abs(econml_se - sm_se) / sm_se
         self.assertLess(relative_diff, 1e-4,
-                       f"EconML SE ({econml_se:.8f}) differs from adjusted statsmodels SE ({adjusted_sm_se:.8f})")
+                       f"EconML SE ({econml_se:.8f}) differs from statsmodels SE ({sm_se:.8f})")
 
     def test_clustered_micro_equals_aggregated(self):
         """Test that clustered SE matches for summarized and non-summarized data."""
@@ -238,11 +231,14 @@ def _generate_micro_and_aggregated(rng, *, n_groups=12, cells_per_group=6, d=4,
             (X, ybar, sw, freq, svar, groups), (X_micro, y_micro, sw_micro, groups_micro) = \
                 _generate_micro_and_aggregated(rng, n_groups=10, cells_per_group=7, d=5, p=p)
 
-            m_agg = StatsModelsLinearRegression(fit_intercept=True, cov_type="clustered", enable_federation=False)
+            # Disable DF correction since n differs between aggregated and micro datasets
+            cov_opts = {'group_correction': True, 'df_correction': False}
+            m_agg = StatsModelsLinearRegression(fit_intercept=True, cov_type="clustered",
+                                               cov_options=cov_opts, enable_federation=False)
             m_agg.fit(X, ybar, sample_weight=sw, freq_weight=freq, sample_var=svar, groups=groups)
 
             m_micro = StatsModelsLinearRegression(fit_intercept=True, cov_type="clustered",
-                                                 enable_federation=False)
+                                                 cov_options=cov_opts, enable_federation=False)
             m_micro.fit(
                 X_micro,
                 y_micro,
@@ -255,3 +251,55 @@ def _generate_micro_and_aggregated(rng, *, n_groups=12, cells_per_group=6, d=4,
             np.testing.assert_allclose(m_agg._param, m_micro._param, rtol=1e-12, atol=1e-12)
             np.testing.assert_allclose(np.array(m_agg._param_var), np.array(m_micro._param_var),
                                        rtol=1e-10, atol=1e-12)
+
+    def test_clustered_correction_factors(self):
+        """Test that correction factors are applied correctly."""
+        np.random.seed(42)
+        n = 200
+        n_groups = 20
+        X = np.random.randn(n, 3)
+        groups = np.repeat(np.arange(n_groups), n // n_groups)
+        y = X[:, 0] + 0.5 * X[:, 1] + np.random.randn(n) * 0.5
+
+        # Fit models with different correction options
+        m_none = StatsModelsLinearRegression(
+            cov_type='clustered',
+            cov_options={'group_correction': False, 'df_correction': False}
+        ).fit(X, y, groups=groups)
+
+        m_group = StatsModelsLinearRegression(
+            cov_type='clustered',
+            cov_options={'group_correction': True, 'df_correction': False}
+        ).fit(X, y, groups=groups)
+
+        m_df = StatsModelsLinearRegression(
+            cov_type='clustered',
+            cov_options={'group_correction': False, 'df_correction': True}
+        ).fit(X, y, groups=groups)
+
+        m_both = StatsModelsLinearRegression(
+            cov_type='clustered',
+            cov_options={'group_correction': True, 'df_correction': True}
+        ).fit(X, y, groups=groups)
+
+        # Parameter count k: coefficients plus the intercept (fit_intercept defaults to True)
+        k_params = len(m_none.coef_) + 1
+
+        # Verify group correction
+        group_ratio = m_group.coef_stderr_ / m_none.coef_stderr_
+        expected_group_ratio = np.sqrt(n_groups / (n_groups - 1))
+        np.testing.assert_allclose(group_ratio, expected_group_ratio, rtol=1e-10)
+
+        # Verify DF correction
+        df_ratio = m_df.coef_stderr_ / m_none.coef_stderr_
+        expected_df_ratio = np.sqrt((n - 1) / (n - k_params))
+        np.testing.assert_allclose(df_ratio, expected_df_ratio, rtol=1e-10)
+
+        # Verify combined correction
+        combined_ratio = m_both.coef_stderr_ / m_none.coef_stderr_
+        expected_combined_ratio = np.sqrt(n_groups / (n_groups - 1) * (n - 1) / (n - k_params))
+        np.testing.assert_allclose(combined_ratio, expected_combined_ratio, rtol=1e-10)
+
+        # Verify multiplicative property
+        both_from_components = m_group.coef_stderr_ * m_df.coef_stderr_ / m_none.coef_stderr_
+        np.testing.assert_allclose(m_both.coef_stderr_, both_from_components, rtol=1e-10)