
Commit 42381ba

Merge branch 'develop' into dependabot/pip/transformers-4.48.0

2 parents: ae8b0eb + bb77008

38 files changed: +1361 −598 lines

CHANGELOG.md

Lines changed: 3 additions & 3 deletions

@@ -76,6 +76,9 @@
 
 ### Changed
 
+- Switched all semi-value coefficients and sampler weights to log-space in
+  order to avoid overflows
+  [PR #643](https://github.com/aai-institute/pyDVL/pull/643)
 - Updated and rewrote some of the MSR banzhaf notebook
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
 - Updated Least-Core notebook
@@ -84,9 +87,6 @@
   thus subsuming Variance-Reduced stratified sampling into a unified framework.
   Implemented the heuristics proposed in that paper
   [PR #641](https://github.com/aai-institute/pyDVL/pull/641)
-- Changed the way semi-value coefficients are composed with sampler weights in
-  order to avoid `OverflowError` for very small or large values
-  [PR #639](https://github.com/aai-institute/pyDVL/pull/639)
 - Uniformly distribute test points across processes for KNNShapley. Fail for
   `GroupedDataset` [PR #632](https://github.com/aai-institute/pyDVL/pull/632)
 - Introduced the concept of logical vs data indices for `Dataset`, and

src/pydvl/utils/numeric.py

Lines changed: 188 additions & 40 deletions

@@ -1,6 +1,5 @@
 """
-This module contains routines for numerical computations used across the
-library.
+This module contains routines for numerical computations used across the library.
 """
 
 from __future__ import annotations
@@ -10,29 +9,31 @@
     Collection,
     Generator,
     Iterator,
-    List,
     Optional,
     Sequence,
-    Tuple,
     TypeVar,
-    overload,
 )
 
 import numpy as np
 from numpy.typing import NDArray
+from scipy.special import gammaln
 
 from pydvl.utils.types import Seed
 
 __all__ = [
     "complement",
-    "running_moments",
+    "logcomb",
+    "logexp",
+    "log_running_moments",
+    "logsumexp_two",
     "num_samples_permutation_hoeffding",
     "powerset",
     "random_matrix_with_condition_number",
     "random_subset",
     "random_powerset",
     "random_powerset_label_min",
     "random_subset_of_size",
+    "running_moments",
     "top_k_value_accuracy",
 ]
 
@@ -202,7 +203,7 @@ def random_powerset_label_min(
     unique_labels = np.unique(labels)
 
     while True:
-        subsets: List[NDArray[T]] = []
+        subsets: list[NDArray[T]] = []
         for label in unique_labels:
             label_indices = np.asarray(np.where(labels == label)[0])
             subset_size = int(
@@ -291,53 +292,51 @@ def random_matrix_with_condition_number(
     return P
 
 
-@overload
-def running_moments(
-    previous_avg: float, previous_variance: float, count: int, new_value: float
-) -> Tuple[float, float]: ...
-
-
-@overload
-def running_moments(
-    previous_avg: NDArray[np.float64],
-    previous_variance: NDArray[np.float64],
-    count: int,
-    new_value: NDArray[np.float64],
-) -> Tuple[NDArray[np.float64], NDArray[np.float64]]: ...
-
-
 def running_moments(
-    previous_avg: float | NDArray[np.float64],
-    previous_variance: float | NDArray[np.float64],
+    previous_avg: float,
+    previous_variance: float,
     count: int,
-    new_value: float | NDArray[np.float64],
-) -> Tuple[float | NDArray[np.float64], float | NDArray[np.float64]]:
-    """Uses Welford's algorithm to calculate the running average and variance of
-    a set of numbers.
+    new_value: float,
+    unbiased: bool = True,
+) -> tuple[float, float]:
+    """Calculates running average and variance of a series of numbers.
 
-    See [Welford's algorithm in wikipedia](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)
+    See [Welford's algorithm in
+    wikipedia](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm)
 
     !!! Warning
         This is not really using Welford's correction for numerical stability
         for the variance. (FIXME)
 
     !!! Todo
-        This could be generalised to arbitrary moments. See [this paper](https://www.osti.gov/biblio/1028931)
+        This could be generalised to arbitrary moments. See [this
+        paper](https://www.osti.gov/biblio/1028931)
 
     Args:
-        previous_avg: average value at previous step
-        previous_variance: variance at previous step
-        count: number of points seen so far
-        new_value: new value in the series of numbers
-
+        previous_avg: average value at previous step.
+        previous_variance: variance at previous step.
+        count: number of points seen so far.
+        new_value: new value in the series of numbers.
+        unbiased: whether to use the unbiased variance estimator (same as `np.var`
+            with `ddof=1`).
     Returns:
         new_average, new_variance, calculated with the new count
     """
-    # broadcasted operations seem not to be supported by mypy, so we ignore the type
-    new_average = (new_value + count * previous_avg) / (count + 1)  # type: ignore
-    new_variance = previous_variance + (
-        (new_value - previous_avg) * (new_value - new_average) - previous_variance
-    ) / (count + 1)
+    delta = new_value - previous_avg
+    new_average = previous_avg + delta / (count + 1)
+
+    if unbiased:
+        if count > 0:
+            new_variance = (
+                previous_variance + delta**2 / (count + 1) - previous_variance / count
+            )
+        else:
+            new_variance = 0.0
+    else:
+        new_variance = previous_variance + (
+            delta * (new_value - new_average) - previous_variance
+        ) / (count + 1)
 
     return new_average, new_variance
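A quick sanity check of the rewritten update rule, before the new log-space helpers in the next hunk. This is a sketch, not part of the commit; it assumes a pyDVL install that includes this change, so that running_moments is importable with the new signature. Streaming the update over a series should reproduce numpy's batch estimates:

    import numpy as np

    from pydvl.utils.numeric import running_moments

    rng = np.random.default_rng(42)
    xs = rng.normal(size=100)

    avg, var = 0.0, 0.0
    for count, x in enumerate(xs):
        # count is the number of points seen *before* x, as the signature expects
        avg, var = running_moments(avg, var, count, float(x), unbiased=True)

    assert np.isclose(avg, xs.mean())
    assert np.isclose(var, xs.var(ddof=1))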

@@ -359,3 +358,152 @@ def top_k_value_accuracy(
     top_k_pred_values = np.argsort(y_pred)[-k:]
     top_k_accuracy = len(np.intersect1d(top_k_exact_values, top_k_pred_values)) / k
     return top_k_accuracy
+
+
+def logcomb(n: int, k: int) -> float:
+    r"""Computes the log of the binomial coefficient (n choose k).
+
+    $$
+    \begin{array}{rcl}
+    \log\binom{n}{k} & = & \log(n!) - \log(k!) - \log((n-k)!) \\
+                     & = & \log\Gamma(n+1) - \log\Gamma(k+1) - \log\Gamma(n-k+1).
+    \end{array}
+    $$
+
+    Args:
+        n: Total number of elements
+        k: Number of elements to choose
+    Returns:
+        The log of the binomial coefficient
+    """
+    if k < 0 or k > n or n < 0:
+        raise ValueError(f"Invalid arguments: n={n}, k={k}")
+    return float(gammaln(n + 1) - gammaln(k + 1) - gammaln(n - k + 1))
+
+
+def logexp(x: float, a: float) -> float:
+    """Computes log(x^a).
+
+    Args:
+        x: Base
+        a: Exponent
+    Returns:
+        a * log(x)
+    """
+    return float(a * np.log(x))
+
+
+def logsumexp_two(log_a: float, log_b: float) -> float:
+    r"""Numerically stable computation of log(exp(log_a) + exp(log_b)).
+
+    Uses the standard log-sum-exp trick:
+
+    $$
+    \log(\exp(\log a) + \exp(\log b)) = m + \log(\exp(\log a - m) + \exp(\log b - m)),
+    $$
+
+    where $m = \max(\log a, \log b)$.
+
+    Args:
+        log_a: Log of the first value
+        log_b: Log of the second value
+    Returns:
+        The log of the sum of the exponentials
+    """
+    if log_a == -np.inf:
+        return log_b
+    if log_b == -np.inf:
+        return log_a
+    m = max(log_a, log_b)
+    return float(m + np.log(np.exp(log_a - m) + np.exp(log_b - m)))
+
+
+def log_running_moments(
+    previous_log_sum_pos: float,
+    previous_log_sum_neg: float,
+    previous_log_sum2: float,
+    count: int,
+    new_log_value: float,
+    new_sign: int,
+    unbiased: bool = True,
+) -> tuple[float, float, float, float, float]:
+    """
+    Update running moments when the new value is provided in log space,
+    allowing for negative values via an explicit sign.
+
+    Here the actual value is x = new_sign * exp(new_log_value). Rather than
+    updating the arithmetic sums S = sum(x) and S2 = sum(x^2) directly, we maintain:
+
+        L_S+ = log(sum_{i: x_i >= 0} x_i)
+        L_S- = log(sum_{i: x_i < 0} |x_i|)
+        L_S2 = log(sum_i x_i^2)
+
+    The running mean is then computed as:
+
+        mean = (exp(L_S+) - exp(L_S-)) / count
+
+    and the second moment is:
+
+        second_moment = exp(L_S2 - log(count))
+
+    so that the variance is:
+
+        variance = second_moment - mean^2
+
+    For the unbiased (sample) estimator, we scale the variance by count/(count-1)
+    when count > 1 (and define variance = 0 when count == 1).
+
+    Args:
+        previous_log_sum_pos: running log(sum of positive contributions), or -inf
+            if none.
+        previous_log_sum_neg: running log(sum of negative contributions in absolute
+            value), or -inf if none.
+        previous_log_sum2: running log(sum of squares) so far (or -inf if none).
+        count: number of points processed so far.
+        new_log_value: log(|x_new|), where x_new is the new value.
+        new_sign: sign of the new value (should be +1, 0, or -1).
+        unbiased: if True, compute the unbiased estimator of the variance.
+
+    Returns:
+        new_mean: running mean in the linear domain.
+        new_variance: running variance in the linear domain.
+        new_log_sum_pos: updated running log(sum of positive contributions).
+        new_log_sum_neg: updated running log(sum of negative contributions).
+        new_log_sum2: updated running log(sum of squares).
+    """
+
+    if count == 0:
+        if new_sign >= 0:
+            new_log_sum_pos = new_log_value
+            new_log_sum_neg = -np.inf  # No negative contribution yet.
+        else:
+            new_log_sum_pos = -np.inf
+            new_log_sum_neg = new_log_value
+        new_log_sum2 = 2 * new_log_value
+    else:
+        if new_sign >= 0:
+            new_log_sum_pos = logsumexp_two(previous_log_sum_pos, new_log_value)
+            new_log_sum_neg = previous_log_sum_neg
+        else:
+            new_log_sum_neg = logsumexp_two(previous_log_sum_neg, new_log_value)
+            new_log_sum_pos = previous_log_sum_pos
+        new_log_sum2 = logsumexp_two(previous_log_sum2, 2 * new_log_value)
+    new_count = count + 1
+
+    # Compute 1st and 2nd moments in the linear domain.
+    pos_sum = np.exp(new_log_sum_pos) if new_log_sum_pos != -np.inf else 0.0
+    neg_sum = np.exp(new_log_sum_neg) if new_log_sum_neg != -np.inf else 0.0
+    new_mean = (pos_sum - neg_sum) / new_count
+
+    second_moment = np.exp(new_log_sum2 - np.log(new_count))
+
+    # Compute variance using either the population or unbiased estimator.
+    if unbiased:
+        if new_count > 1:
+            new_variance = new_count / (new_count - 1) * (second_moment - new_mean**2)
+        else:
+            new_variance = 0.0
+    else:
+        new_variance = second_moment - new_mean**2
+
+    return new_mean, new_variance, new_log_sum_pos, new_log_sum_neg, new_log_sum2
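Taken together, the new helpers let callers accumulate means and variances of values whose magnitudes would overflow or underflow a float. A usage sketch (not part of the commit; it assumes a pyDVL install that includes this change):

    import math

    import numpy as np

    from pydvl.utils.numeric import log_running_moments, logcomb, logsumexp_two

    # logcomb matches the exact binomial coefficient where it fits in a float...
    assert np.isclose(logcomb(10, 3), math.log(math.comb(10, 3)))
    # ...and stays finite where float(math.comb(10_000, 5_000)) would overflow.
    print(logcomb(10_000, 5_000))  # ≈ 6926.6

    # logsumexp_two: log(exp(a) + exp(b)) without leaving log space.
    assert np.isclose(logsumexp_two(math.log(2.0), math.log(3.0)), math.log(5.0))

    # log_running_moments tracks moments of signed values given as (log|x|, sign);
    # the caller keeps the count, since it is not among the returned values.
    xs = [1.5, -0.5, 2.0, -1.0]
    log_pos, log_neg, log_sq = -np.inf, -np.inf, -np.inf
    for count, x in enumerate(xs):
        mean, var, log_pos, log_neg, log_sq = log_running_moments(
            log_pos, log_neg, log_sq, count, math.log(abs(x)), int(np.sign(x))
        )

    assert np.isclose(mean, np.mean(xs))
    assert np.isclose(var, np.var(xs, ddof=1))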

src/pydvl/valuation/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -2,4 +2,5 @@
 from pydvl.valuation.methods import *
 from pydvl.valuation.samplers import *
 from pydvl.valuation.scorers import *
+from pydvl.valuation.stopping import *
 from pydvl.valuation.utility import *

src/pydvl/valuation/methods/beta_shapley.py

Lines changed: 5 additions & 4 deletions

@@ -28,9 +28,10 @@ def __init__(
         self.alpha = alpha
         self.beta = beta
         self.const = sp.special.beta(alpha, beta)
+        self.log_const = sp.special.betaln(alpha, beta)
 
-    def coefficient(self, n: int, k: int, weight: float) -> float:
+    def log_coefficient(self, n: int, k: int) -> float:
         j = k + 1
-        w = sp.special.beta(j + self.beta - 1, n - j + self.alpha) / self.const
-        # return math.comb(n - 1, j - 1) * w * n * other
-        return float(w) * weight
+        return float(
+            sp.special.betaln(j + self.beta - 1, n - j + self.alpha) - self.log_const
+        )
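The refactoring preserves the coefficient exactly: exp of the new log-space expression equals the old linear-space ratio. A standalone check with SciPy (not part of the commit; α=16, β=1 is used here only as an example of a Beta-Shapley weighting):

    import numpy as np
    from scipy import special

    alpha, beta = 16.0, 1.0  # example Beta(α, β) weighting
    n, k = 100, 10
    j = k + 1

    old = special.beta(j + beta - 1, n - j + alpha) / special.beta(alpha, beta)
    new = special.betaln(j + beta - 1, n - j + alpha) - special.betaln(alpha, beta)

    # Same coefficient, but the log form cannot overflow, or underflow to 0.
    assert np.isclose(np.exp(new), old)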

src/pydvl/valuation/methods/classwise_shapley.py

Lines changed: 5 additions & 3 deletions

@@ -149,10 +149,12 @@ def fit(self, data: Dataset):
         self.is_done.reset()
         self.utility.training_data = data
 
-        sample_generator = self.sampler.from_data(data)
         strategy = self.sampler.make_strategy(self.utility)
+        updater = self.sampler.result_updater(self.result)
         processor = delayed(strategy.process)
 
+        sample_generator = self.sampler.from_data(data)
+
         with Parallel(return_as="generator_unordered") as parallel:
             with make_parallel_flag() as flag:
                 delayed_evals = parallel(
@@ -162,7 +164,7 @@ def fit(self, data: Dataset):
 
                 for batch in Progress(delayed_evals, self.is_done, **self.tqdm_args):
                     for evaluation in batch:
-                        self.result.update(evaluation.idx, evaluation.update)
+                        self.result = updater(evaluation)
                         if self.is_done(self.result):
                             flag.set()
                             self.sampler.interrupt()
@@ -211,6 +213,6 @@ def _normalize(self) -> ValuationResult:
 
         sigma = np.sum(self.result.values[indices_label_set])
         if sigma != 0:
-            self.result.scale(in_class_acc / sigma, indices=indices_label_set)
+            self.result.scale(in_class_acc / sigma, data_indices=indices_label_set)
 
         return self.result

src/pydvl/valuation/methods/data_banzhaf.py

Lines changed: 4 additions & 2 deletions

@@ -25,6 +25,8 @@
     6388–6421. PMLR, 2023.
 """
 
+import numpy as np
+
 from pydvl.valuation.methods.semivalue import SemivalueValuation
 
 __all__ = ["DataBanzhafValuation"]
@@ -35,5 +37,5 @@ class DataBanzhafValuation(SemivalueValuation):
 
     algorithm_name = "Data-Banzhaf"
 
-    def coefficient(self, n: int, k: int, weight: float) -> float:
-        return float(weight / 2 ** (n - 1))
+    def log_coefficient(self, n: int, k: int) -> float:
+        return float(-(n - 1) * np.log(2))
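The motivation here is the same as in the rest of the commit: the Banzhaf coefficient 1/2^(n-1) underflows to zero in linear space once n reaches the thousands, while its logarithm stays representable. A small illustration (not part of the commit):

    import numpy as np

    n = 50
    log_coeff = -(n - 1) * np.log(2)
    assert np.isclose(np.exp(log_coeff), 1 / 2 ** (n - 1))

    # For large n the linear form underflows while the log form remains finite:
    n = 10_000
    print(2.0 ** -(n - 1))       # 0.0 (underflow)
    print(-(n - 1) * np.log(2))  # about -6930.8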
