@@ -1,17 +1,20 @@
 import logging
 import math
+import os
 import random
 import re
 import string
 from collections.abc import Iterable
-from typing import List
+from typing import Callable, List, Optional, Sequence, TypeVar

 import numpy as np
 import sacrebleu

 from lm_eval.api.registry import register_aggregation, register_metric


+T = TypeVar("T")
+
 eval_logger = logging.getLogger(__name__)

@@ -287,7 +290,7 @@ def pop_stddev(arr):
     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))


-def sample_stddev(arr):
+def sample_stddev(arr: Sequence[T]) -> float:
     mu = mean(arr)
     return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))

@@ -449,11 +452,16 @@ def _sacreformat(refs, preds):


 class _bootstrap_internal:
-    def __init__(self, f, n) -> None:
+    """
+    Pool worker: `(i, xs)` → `n` bootstrap replicates
+    of `f(xs)` using an RNG seeded with `i`.
+    """
+
+    def __init__(self, f: Callable[[Sequence[T]], float], n: int) -> None:
         self.f = f
         self.n = n

-    def __call__(self, v):
+    def __call__(self, v: tuple[int, Sequence[T]]) -> list[float]:
         i, xs = v
         rnd = random.Random()
         rnd.seed(i)
@@ -463,36 +471,81 @@ def __call__(self, v):
             return res


-def bootstrap_stderr(f, xs, iters):
-    import multiprocessing as mp
-
-    pool = mp.Pool(mp.cpu_count())
-    # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
-    # equivalent to stderr calculated without Bessel's correction in the stddev.
-    # Unfortunately, I haven't been able to figure out what the right correction is
-    # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
-    # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
-    # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+def _bootstrap_internal_no_mp(
+    f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> list[float]:
+    """
+    Single-process fallback: compute `iters` bootstrap replicates
+    of the statistic `f(xs)`, chunked (≤ 1000 draws per chunk).
+    """
     res = []
     chunk_size = min(1000, iters)
     from tqdm import tqdm

-    print("bootstrapping for stddev:", f.__name__)
-    for bootstrap in tqdm(
-        pool.imap(
-            _bootstrap_internal(f, chunk_size),
-            [(i, xs) for i in range(iters // chunk_size)],
-        ),
-        total=iters // chunk_size,
-    ):
-        # sample w replacement
-        res.extend(bootstrap)
-
-    pool.close()
+    print(f"bootstrapping for stddev: {f.__name__}")
+
+    # A single loop replaces the multiprocessing pool.
+    for i in tqdm(range(iters // chunk_size)):
+        rnd = random.Random(i)
+        for _ in range(chunk_size):
+            res.append(f(rnd.choices(xs, k=len(xs))))
+
+    return res
+
+
+def bootstrap_stderr(
+    f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
+) -> float:
+    """
+    Bootstrap estimate of the standard error of the statistic `f(xs)`,
+    using up to `iters` resamples, chunked (≤ 1000 draws per chunk).
+
+    Executes in parallel unless the env var `DISABLE_MULTIPROC` is set.
+    """
+    if not os.getenv("DISABLE_MULTIPROC"):
+        import multiprocessing as mp
+
+        pool = mp.Pool(mp.cpu_count())
+        # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
+        # equivalent to stderr calculated without Bessel's correction in the stddev.
+        # Unfortunately, I haven't been able to figure out what the right correction is
+        # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
+        # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
+        # Thankfully, shouldn't matter because our samples are pretty big usually anyways
+        res = []
+        chunk_size = min(1000, iters)
+        from tqdm import tqdm
+
+        print("bootstrapping for stddev:", f.__name__)
+        for bootstrap in tqdm(
+            pool.imap(
+                _bootstrap_internal(f, chunk_size),
+                [(i, xs) for i in range(iters // chunk_size)],
+            ),
+            total=iters // chunk_size,
+        ):
+            # sample w replacement
+            res.extend(bootstrap)
+
+        pool.close()
+    else:
+        res = _bootstrap_internal_no_mp(f, xs, iters)
+
     return sample_stddev(res)


-def stderr_for_metric(metric, bootstrap_iters: int):
+def stderr_for_metric(
+    metric: Callable[[Sequence[T]], float], bootstrap_iters: int
+) -> Optional[Callable[[Sequence[T]], float]]:
+    """
+    Return a function that estimates the standard error of `metric(xs)`.
+
+    * If `bootstrap_iters > 0` and the metric is in the pre-approved
+      bootstrappable list, use `bootstrap_stderr` with that many draws.
+    * If the metric has a closed-form SE (e.g. `mean`, `acc_all`), use it.
+    * Otherwise, return `None`.
+    """
+
     if bootstrap_iters <= 0:
         # return no function (don't compute stderr) if bootstrap iters = 0
         return None
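
For context, the single-process path added in this diff amounts to: draw up to `iters` bootstrap resamples of `xs` with replacement, one RNG seed per chunk of at most 1000 draws, evaluate the statistic on each resample, and report the sample standard deviation of the replicates as the standard error. The following is a minimal, self-contained sketch of that idea, not part of the patch; the name `bootstrap_stderr_sketch` and the example `scores` list are illustrative only.

import math
import random
from statistics import mean


def bootstrap_stderr_sketch(f, xs, iters=2000):
    # One RNG seed per chunk of at most 1000 draws, mirroring the chunking above.
    chunk_size = min(1000, iters)
    replicates = []
    for i in range(iters // chunk_size):
        rnd = random.Random(i)
        for _ in range(chunk_size):
            # Resample xs with replacement and re-evaluate the statistic.
            replicates.append(f(rnd.choices(xs, k=len(xs))))
    mu = mean(replicates)
    # Sample standard deviation of the replicates approximates the stderr of f(xs).
    return math.sqrt(sum((r - mu) ** 2 for r in replicates) / (len(replicates) - 1))


if __name__ == "__main__":
    scores = [0, 1, 1, 0, 1, 1, 1, 0, 1, 1]  # e.g. per-example accuracies
    print(bootstrap_stderr_sketch(mean, scores))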