Guard winsorized weights against percentile overshoot (#190)

neuralsorcerer · meta-codesync[bot] · commit 421d23ea823d · 2025-12-02T03:03:32.000-08:00
Summary: Added post-winsorization clipping that uses the original percentile bounds to prevent numerical overshoot when trimming weights. Fixes #188. Pull Request resolved: #190. Differential Revision: D88137688. Pulled By: talgalili. fbshipit-source-id: de62c0897021c1da03d466e7ca1dfa08c1067d3b
diff --git a/balance/adjustment.py b/balance/adjustment.py
@@ -17,6 +17,7 @@
 import scipy
 
 from balance import util as balance_util
+from balance.testutil import _verify_value_type
 from balance.weighting_methods import (
     adjust_null as balance_adjust_null,
     cbps as balance_cbps,
@@ -272,11 +273,21 @@ def trim_weights(
         else:
             lower_limit = upper_limit = percentile
 
+        # Keep the original requested percentiles for exact clipping bounds,
+        # but validate/adjust separately for the winsorization call so at least
+        # one value is affected at the requested edge.
+        clip_limits = (
+            None if (lower_limit is None or lower_limit == 0) else lower_limit,
+            None if (upper_limit is None or upper_limit == 0) else upper_limit,
+        )
         adjusted_limits = (
             _validate_limit(lower_limit, n_weights),
             _validate_limit(upper_limit, n_weights),
         )
 
+        # Preserve the pre-trim weights to calculate strict clipping bounds.
+        original_weights_for_bounds = weights.copy()
+
         weights = scipy.stats.mstats.winsorize(
             weights, limits=adjusted_limits, inplace=False
         )
@@ -291,6 +302,26 @@ def trim_weights(
             name=original_name,
         )
 
+        # Clip to the exact percentile bounds to avoid small numerical overshoots
+        # from scipy.stats.mstats.winsorize on certain inputs.
+        lower_bound = (
+            None
+            if clip_limits[0] is None
+            else np.quantile(
+                original_weights_for_bounds, clip_limits[0], method="lower"
+            )
+        )
+        upper_bound = (
+            None
+            if clip_limits[1] is None
+            else np.quantile(
+                original_weights_for_bounds,
+                1 - _verify_value_type(clip_limits[1]),
+                method="lower",
+            )
+        )
+        weights = weights.clip(lower=lower_bound, upper=upper_bound)
+
     if keep_sum_of_weights:
         weights = weights / np.mean(weights) * original_mean