@@ -110,7 +110,7 @@ def test_clustered_se_without_groups_defaults_to_individual(self):
110110 T = np .random .binomial (1 , 0.5 , n )
111111 Y = np .random .normal (0 , 1 , n )
112112
113- # Clustered SE without groups (defaults to individual groups )
113+ # Clustered SE without explicit groups (each observation is its own cluster), both default corrections enabled
114114 np .random .seed (123 )
115115 est_clustered = DML (model_y = LassoCV (), model_t = LogisticRegression (),
116116 model_final = StatsModelsLinearRegression (fit_intercept = False , cov_type = 'clustered' ),
@@ -129,16 +129,13 @@ def test_clustered_se_without_groups_defaults_to_individual(self):
129129 lb_clustered , ub_clustered = est_clustered .effect_interval (X_test , alpha = 0.05 )
130130 lb_hc0 , ub_hc0 = est_hc0 .effect_interval (X_test , alpha = 0.05 )
131131
132- # Clustered SE should be HC0 SE * sqrt(n/(n-1)) when each obs is its own cluster
133- # Width of confidence intervals should differ by the adjustment factor
134- width_clustered = ub_clustered - lb_clustered
135- width_hc0 = ub_hc0 - lb_hc0
136-
137- # When each observation is its own cluster, clustered SE should equal HC0 * sqrt(n/(n-1))
138- # due to the finite sample correction factor
139- correction_factor = np .sqrt (n / (n - 1 ))
140- expected_width = width_hc0 * correction_factor
141- np .testing .assert_allclose (width_clustered , expected_width , rtol = 1e-10 )
132+ # With both corrections and one cluster per observation (G = n): sqrt(n/(n-1)) * sqrt((n-1)/(n-k)) = sqrt(n/(n-k))
133+ # Get k from the fitted model (includes treatment variable)
134+ k_params = est_clustered .model_final_ .coef_ .shape [0 ]
135+ correction_factor = np .sqrt (n / (n - k_params ))
136+ expected_width = (ub_hc0 - lb_hc0 ) * correction_factor
137+ actual_width = ub_clustered - lb_clustered
138+ np .testing .assert_allclose (actual_width , expected_width , rtol = 1e-10 )
142139
143140 # Test basic functionality still works
144141 effects = est_clustered .effect (X_test )
@@ -169,15 +166,11 @@ def test_clustered_se_matches_statsmodels(self):
169166 sm_model = sm .OLS (Y , X_with_intercept ).fit (cov_type = 'cluster' , cov_kwds = {'groups' : groups })
170167 sm_se = sm_model .bse [1 ] # SE for X[:, 0] coefficient
171168
172- # Account for statsmodels' additional n/(n-k) adjustment
173- k = X_with_intercept .shape [1 ] # Number of parameters
174- sm_adjustment = np .sqrt ((n - 1 ) / (n - k ))
175- adjusted_sm_se = sm_se / sm_adjustment
176-
177- # Should match very closely
178- relative_diff = abs (econml_se - adjusted_sm_se ) / adjusted_sm_se
169+ # Statsmodels applies both G/(G-1) and (N-1)/(N-K) corrections by default
170+ # Our implementation also applies both by default, so they should match
171+ relative_diff = abs (econml_se - sm_se ) / sm_se
179172 self .assertLess (relative_diff , 1e-4 ,
180- f"EconML SE ({ econml_se :.8f} ) differs from adjusted statsmodels SE ({ adjusted_sm_se :.8f} )" )
173+ f"EconML SE ({ econml_se :.8f} ) differs from statsmodels SE ({ sm_se :.8f} )" )
181174
182175 def test_clustered_micro_equals_aggregated (self ):
183176 """Test that clustered SE matches for summarized and non-summarized data."""
@@ -238,11 +231,14 @@ def _generate_micro_and_aggregated(rng, *, n_groups=12, cells_per_group=6, d=4,
238231 (X , ybar , sw , freq , svar , groups ), (X_micro , y_micro , sw_micro , groups_micro ) = \
239232 _generate_micro_and_aggregated (rng , n_groups = 10 , cells_per_group = 7 , d = 5 , p = p )
240233
241- m_agg = StatsModelsLinearRegression (fit_intercept = True , cov_type = "clustered" , enable_federation = False )
234+ # Disable DF correction since n differs between aggregated and micro datasets
235+ cov_opts = {'group_correction' : True , 'df_correction' : False }
236+ m_agg = StatsModelsLinearRegression (fit_intercept = True , cov_type = "clustered" ,
237+ cov_options = cov_opts , enable_federation = False )
242238 m_agg .fit (X , ybar , sample_weight = sw , freq_weight = freq , sample_var = svar , groups = groups )
243239
244240 m_micro = StatsModelsLinearRegression (fit_intercept = True , cov_type = "clustered" ,
245- enable_federation = False )
241+ cov_options = cov_opts , enable_federation = False )
246242 m_micro .fit (
247243 X_micro ,
248244 y_micro ,
@@ -255,3 +251,55 @@ def _generate_micro_and_aggregated(rng, *, n_groups=12, cells_per_group=6, d=4,
255251 np .testing .assert_allclose (m_agg ._param , m_micro ._param , rtol = 1e-12 , atol = 1e-12 )
256252 np .testing .assert_allclose (np .array (m_agg ._param_var ), np .array (m_micro ._param_var ),
257253 rtol = 1e-10 , atol = 1e-12 )
254+
255+ def test_clustered_correction_factors (self ):
256+ """Test that correction factors are applied correctly."""
257+ np .random .seed (42 )
258+ n = 200
259+ n_groups = 20
260+ X = np .random .randn (n , 3 )
261+ groups = np .repeat (np .arange (n_groups ), n // n_groups )
262+ y = X [:, 0 ] + 0.5 * X [:, 1 ] + np .random .randn (n ) * 0.5
263+
264+ # Fit models with different correction options
265+ m_none = StatsModelsLinearRegression (
266+ cov_type = 'clustered' ,
267+ cov_options = {'group_correction' : False , 'df_correction' : False }
268+ ).fit (X , y , groups = groups )
269+
270+ m_group = StatsModelsLinearRegression (
271+ cov_type = 'clustered' ,
272+ cov_options = {'group_correction' : True , 'df_correction' : False }
273+ ).fit (X , y , groups = groups )
274+
275+ m_df = StatsModelsLinearRegression (
276+ cov_type = 'clustered' ,
277+ cov_options = {'group_correction' : False , 'df_correction' : True }
278+ ).fit (X , y , groups = groups )
279+
280+ m_both = StatsModelsLinearRegression (
281+ cov_type = 'clustered' ,
282+ cov_options = {'group_correction' : True , 'df_correction' : True }
283+ ).fit (X , y , groups = groups )
284+
285+ # Get actual number of parameters
286+ k_params = len (m_none .coef_ ) + 1
287+
288+ # Verify group correction
289+ group_ratio = m_group .coef_stderr_ / m_none .coef_stderr_
290+ expected_group_ratio = np .sqrt (n_groups / (n_groups - 1 ))
291+ np .testing .assert_allclose (group_ratio , expected_group_ratio , rtol = 1e-10 )
292+
293+ # Verify DF correction
294+ df_ratio = m_df .coef_stderr_ / m_none .coef_stderr_
295+ expected_df_ratio = np .sqrt ((n - 1 ) / (n - k_params ))
296+ np .testing .assert_allclose (df_ratio , expected_df_ratio , rtol = 1e-10 )
297+
298+ # Verify combined correction
299+ combined_ratio = m_both .coef_stderr_ / m_none .coef_stderr_
300+ expected_combined_ratio = np .sqrt (n_groups / (n_groups - 1 ) * (n - 1 ) / (n - k_params ))
301+ np .testing .assert_allclose (combined_ratio , expected_combined_ratio , rtol = 1e-10 )
302+
303+ # Verify multiplicative property
304+ both_from_components = m_group .coef_stderr_ * m_df .coef_stderr_ / m_none .coef_stderr_
305+ np .testing .assert_allclose (m_both .coef_stderr_ , both_from_components , rtol = 1e-10 )
0 commit comments