
Commit aa8aa46

Merge pull request #319 from appliedAI-Initiative/feature/sampler
Semi-values and samplers
2 parents 996e014 + f7ceff0 commit aa8aa46

File tree: 15 files changed, +984 -57 lines changed


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@
 
 ## Unreleased
 
+- **New method**: Implements generalised semi-values for data valuation,
+  including Data Banzhaf and Beta Shapley, with configurable sampling strategies
+  [PR #319](https://github.com/appliedAI-Initiative/pyDVL/pull/319)
 - Adds kwargs parameter to `from_array` and `from_sklearn`
   Dataset and GroupedDataset class methods
   [PR #316](https://github.com/appliedAI-Initiative/pyDVL/pull/316)

README.md

Lines changed: 7 additions & 0 deletions
@@ -54,6 +54,13 @@ methods from the following papers:
   [Towards Efficient Data Valuation Based on the Shapley Value](http://proceedings.mlr.press/v89/jia19a.html).
   In 22nd International Conference on Artificial Intelligence and Statistics,
   1167–76. PMLR, 2019.
+- Wang, Jiachen T., and Ruoxi Jia.
+  [Data Banzhaf: A Robust Data Valuation Framework for Machine Learning](https://doi.org/10.48550/arXiv.2205.15466).
+  arXiv, October 22, 2022.
+- Kwon, Yongchan, and James Zou.
+  [Beta Shapley: A Unified and Noise-Reduced Data Valuation Framework for Machine Learning](http://arxiv.org/abs/2110.14049).
+  In Proceedings of the 25th International Conference on Artificial Intelligence
+  and Statistics (AISTATS) 2022, Vol. 151. Valencia, Spain: PMLR, 2022.
 
 Influence Functions compute the effect that single points have on an estimator /
 model. We implement methods from the following papers:

docs/30-data-valuation.rst

Lines changed: 116 additions & 20 deletions
@@ -241,6 +241,7 @@ v_u(x_i) = \frac{1}{n} \sum_{S \subseteq D \setminus \{x_i\}}
 .. code-block:: python
 
    from pydvl.value import compute_shapley_values
+
    utility = Utility(...)
    values = compute_shapley_values(utility, mode="combinatorial_exact")
    df = values.to_dataframe(column='value')
@@ -264,7 +265,8 @@ same pattern:
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.shapley import compute_shapley_values
+   from pydvl.value import compute_shapley_values
+
    model = ...
    data = Dataset(...)
    utility = Utility(model, data)
@@ -303,7 +305,8 @@ values in pyDVL. First construct the dataset and utility, then call
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.shapley import compute_shapley_values
+   from pydvl.value import compute_shapley_values
+
    model = ...
    dataset = Dataset(...)
    utility = Utility(model, dataset)
@@ -329,11 +332,11 @@ It uses permutations over indices instead of subsets:
 
 $$
 v_u(x_i) = \frac{1}{n!} \sum_{\sigma \in \Pi(n)}
-[u(\sigma_{i-1} \cup \{i\}) - u(\sigma_{i})]
+[u(\sigma_{:i} \cup \{i\}) - u(\sigma_{:i})]
 ,$$
 
-where $\sigma_i$ denotes the set of indices in permutation sigma up until the
-position of index $i$. To approximate this sum (with $\mathcal{O}(n!)$ terms!)
+where $\sigma_{:i}$ denotes the set of indices in permutation $\sigma$ before the
+position where $i$ appears. To approximate this sum (with $\mathcal{O}(n!)$ terms!)
 one uses Monte Carlo sampling of permutations, something which has surprisingly
 low sample complexity. By adding early stopping, the result is the so-called
 **Truncated Monte Carlo Shapley** (:footcite:t:`ghorbani_data_2019`), which is
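
To make the formula in this hunk concrete, here is a toy Monte Carlo permutation
estimator. It is a minimal sketch for illustration, not pyDVL's implementation
(which sits behind `compute_shapley_values`). For the additive utility
u(S) = sum(S) used here, the Shapley value of each index i is exactly i:

    import random

    def utility(subset):
        # Toy additive utility: each element contributes its own value.
        return sum(subset)

    def montecarlo_shapley(indices, n_permutations=2000):
        values = {i: 0.0 for i in indices}
        for _ in range(n_permutations):
            sigma = list(indices)
            random.shuffle(sigma)
            u_prev, prefix = 0.0, set()  # u of the empty set is 0 here
            for i in sigma:
                # Marginal contribution of i given the indices before it in sigma
                prefix.add(i)
                u_curr = utility(prefix)
                values[i] += u_curr - u_prev
                u_prev = u_curr
        return {i: v / n_permutations for i, v in values.items()}

    print(montecarlo_shapley([1, 2, 3]))  # ≈ {1: 1.0, 2: 2.0, 3: 3.0}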
@@ -342,7 +345,7 @@ efficient enough to be useful in some applications.
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.shapley import compute_shapley_values
+   from pydvl.value import compute_shapley_values
 
    model = ...
    data = Dataset(...)
@@ -364,7 +367,7 @@ and can be used in pyDVL with:
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.shapley import compute_shapley_values
+   from pydvl.value import compute_shapley_values
    from sklearn.neighbors import KNeighborsClassifier
 
    model = KNeighborsClassifier(n_neighbors=5)
@@ -410,7 +413,7 @@ its variance.
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.shapley import compute_shapley_values
+   from pydvl.value import compute_shapley_values
 
    model = ...
    data = Dataset(...)
@@ -487,11 +490,12 @@ As such it returns as exact a value as the utility function allows
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.least_core import exact_least_core
+   from pydvl.value import compute_least_core_values
+
    model = ...
    dataset = Dataset(...)
    utility = Utility(model, dataset)
-   values = exact_least_core(utility)
+   values = compute_least_core_values(utility, mode="exact")
 
 Monte Carlo Least Core
 ----------------------
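
For reference, the optimization problem that the exact mode solves can be written
down directly: minimize the subsidy $e$ subject to $\sum_{i \in S} x_i + e \geq u(S)$
for every coalition $S$, with $\sum_i x_i = u(N)$. A minimal sketch with
`scipy.optimize.linprog` (ours, for illustration; not pyDVL's implementation):

    from itertools import chain, combinations

    import numpy as np
    from scipy.optimize import linprog

    def exact_least_core_sketch(n, u):
        # All proper, nonempty coalitions S of {0, ..., n-1}
        subsets = list(chain.from_iterable(combinations(range(n), k) for k in range(1, n)))
        # Variables: x_0, ..., x_{n-1}, e. Objective: minimize e.
        c = np.zeros(n + 1)
        c[-1] = 1.0
        # -(sum_{i in S} x_i) - e <= -u(S), i.e. sum_{i in S} x_i + e >= u(S)
        A_ub = np.zeros((len(subsets), n + 1))
        b_ub = np.zeros(len(subsets))
        for row, S in enumerate(subsets):
            A_ub[row, list(S)] = -1.0
            A_ub[row, -1] = -1.0
            b_ub[row] = -u(S)
        # Efficiency: the payoffs must sum to the utility of the full set
        A_eq = np.ones((1, n + 1))
        A_eq[0, -1] = 0.0
        result = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq,
                         b_eq=[u(tuple(range(n)))], bounds=(None, None))
        return result.x[:n], result.x[-1]  # payoff vector and optimal subsidy e*

    values, subsidy = exact_least_core_sketch(3, lambda S: float(len(S) ** 2))
    print(values, subsidy)  # symmetric utility => equal payoffs of 3.0 each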
@@ -515,16 +519,20 @@ where $e^{*}$ is the optimal least core subsidy.
 .. code-block:: python
 
    from pydvl.utils import Dataset, Utility
-   from pydvl.value.least_core import montecarlo_least_core
+   from pydvl.value import compute_least_core_values
+
    model = ...
    dataset = Dataset(...)
    n_iterations = ...
    utility = Utility(model, dataset)
-   values = montecarlo_least_core(utility, n_iterations=n_iterations)
+   values = compute_least_core_values(
+      utility, mode="montecarlo", n_iterations=n_iterations
+   )
 
 .. note::
 
-   ``n_iterations`` needs to be at least equal to the number of data points.
+   Although any number is supported, it is best to choose ``n_iterations`` to be
+   at least equal to the number of data points.
 
 Because computing the Least Core values requires the solution of a linear and a
 quadratic problem *after* computing all the utility values, we offer the
@@ -538,6 +546,7 @@ list of problems to solve, then solve them in parallel with
 
    from pydvl.utils import Dataset, Utility
    from pydvl.value.least_core import mclc_prepare_problem, lc_solve_problems
+
    model = ...
    dataset = Dataset(...)
    n_iterations = ...
@@ -548,15 +557,102 @@ list of problems to solve, then solve them in parallel with
    values = lc_solve_problems(problems)
 
 
-Other methods
-=============
+Semi-values
+===========
+
+Shapley values are a particular case of a more general concept called semi-value,
+which generalizes them to different weighting schemes. A **semi-value** is
+any valuation function with the form:
+
+$$
+v_\text{semi}(i) = \sum_{k=1}^n w(k)
+\sum_{S \subset D_{-i}^{(k)}} [U(S_{+i})-U(S)],
+$$
+
+where the coefficients $w(k)$ satisfy the property:
+
+$$\sum_{k=1}^n \binom{n-1}{k-1} w(k) = 1.$$
+
+Two instances of this are **Banzhaf indices** (:footcite:t:`wang_data_2022`),
+and **Beta Shapley** (:footcite:t:`kwon_beta_2022`), with better numerical and
+rank stability in certain situations.
+
+.. note::
+
+   Shapley values are a particular case of semi-values and can therefore also be
+   computed with the methods described here. However, as of version 0.5.1, we
+   recommend using :func:`~pydvl.value.shapley.compute_shapley_values` instead,
+   in particular because it implements truncated Monte Carlo sampling for faster
+   computation.
+
+
+Beta Shapley
+^^^^^^^^^^^^
+
+For some machine learning applications, where the utility is typically the
+performance when trained on a set $S \subset D$, diminishing returns are often
+observed when computing the marginal utility of adding a new data point.
+
+Beta Shapley is a weighting scheme that uses the Beta function to place more
+weight on subsets deemed to be more informative. The weights are defined as:
+
+$$
+w(k) := \frac{B(k+\beta, n-k+1+\alpha)}{B(\alpha, \beta)},
+$$
+
+where $B$ is the `Beta function <https://en.wikipedia.org/wiki/Beta_function>`_,
+and $\alpha$ and $\beta$ are parameters that control the weighting of the
+subsets. Setting both to 1 recovers Shapley values, while $\alpha = 1$,
+$\beta = 16$ is reported in :footcite:t:`kwon_beta_2022` to be a good choice for
+some applications. See however :ref:`banzhaf indices` for an alternative choice
+of weights which is reported to work better.
+
+.. code-block:: python
+
+   from pydvl.utils import Dataset, Utility
+   from pydvl.value import MaxUpdates, compute_semivalues
+
+   model = ...
+   data = Dataset(...)
+   utility = Utility(model, data)
+   values = compute_semivalues(
+       u=utility, mode="beta_shapley", done=MaxUpdates(500), alpha=1, beta=16
+   )
+
+.. _banzhaf indices:
 
-There are other game-theoretic concepts in pyDVL's roadmap, based on the notion
-of semivalue, which is a generalization to different weighting schemes:
-in particular **Banzhaf indices** and **Beta Shapley**, with better numerical
-and rank stability in certain situations.
+Banzhaf indices
+^^^^^^^^^^^^^^^
 
-Contributions are welcome!
+As noted below in :ref:`problems of data values`, the Shapley value can be very
+sensitive to variance in the utility function. For machine learning applications,
+where the utility is typically the performance when trained on a set $S \subset
+D$, this variance is often largest for smaller subsets $S$. It is therefore
+reasonable to try reducing the relative contribution of these subsets with
+adequate weights.
+
+One such choice of weights is the Banzhaf index, which is defined as the
+constant:
+
+$$w(k) := \frac{1}{2^{n-1}},$$
+
+for all set sizes $k$. The intuition for picking a constant weight is that for
+any choice of weight function $w$, one can always construct a utility with
+higher variance where $w$ is greater. Therefore, in a worst-case sense, the best
+one can do is to pick a constant weight.
+
+The authors of :footcite:t:`wang_data_2022` show that Banzhaf indices are more
+robust to variance in the utility function than Shapley and Beta Shapley values.
+
+.. code-block:: python
+
+   from pydvl.utils import Dataset, Utility
+   from pydvl.value import MaxUpdates, compute_semivalues
+
+   model = ...
+   data = Dataset(...)
+   utility = Utility(model, data)
+   values = compute_semivalues(u=utility, mode="banzhaf", done=MaxUpdates(500))
 
 
 .. _problems of data values:
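
The weight functions introduced in the new docs section above are easy to compare
numerically. A small sketch (ours; it simply evaluates the formulas as written,
assuming `scipy` is available, and is not pyDVL's internal code):

    import numpy as np
    from scipy.special import beta as beta_fn

    def beta_shapley_weight(k, n, alpha=1.0, beta=16.0):
        # w(k) = B(k + beta, n - k + 1 + alpha) / B(alpha, beta)
        return beta_fn(k + beta, n - k + 1 + alpha) / beta_fn(alpha, beta)

    def banzhaf_weight(k, n):
        # Constant w(k) = 1 / 2^(n-1), independent of the subset size k
        return 1.0 / 2 ** (n - 1)

    n = 8
    ks = np.arange(1, n + 1)
    beta_weights = np.array([beta_shapley_weight(k, n) for k in ks])
    print(beta_weights / beta_weights.sum())  # relative weight per subset size
    print(banzhaf_weight(1, n))               # the same for every k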

docs/conf.py

Lines changed: 1 addition & 1 deletion
@@ -334,7 +334,7 @@ def lineno_from_object_name(source_file, object_name):
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
 html_show_copyright = True
-copyright = "2022 AppliedAI Institute gGmbH"
+copyright = "AppliedAI Institute gGmbH"
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it. The value of this option must be the

src/pydvl/influence/conjugate_gradient.py

Lines changed: 0 additions & 1 deletion
@@ -123,7 +123,6 @@ def batched_preconditioned_conjugate_gradient(
     atol = np.linalg.norm(b, axis=1) * rtol
 
     while iteration < max_iterations:
-
         # remaining fields
         iteration += 1
         not_yet_converged_indices = np.argwhere(np.logical_not(converged))[:, 0]
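
For context, the loop touched by this hunk implements a batched, preconditioned
variant of conjugate gradients. The core iteration for a single right-hand side,
as a minimal sketch rather than the actual pyDVL code, looks like this:

    import numpy as np

    def conjugate_gradient(A, b, rtol=1e-6, max_iterations=100):
        """Solve Ax = b for symmetric positive-definite A."""
        x = np.zeros_like(b)
        r = b - A @ x      # residual
        p = r.copy()       # search direction
        rs_old = r @ r
        for _ in range(max_iterations):
            Ap = A @ p
            alpha = rs_old / (p @ Ap)
            x += alpha * p
            r -= alpha * Ap
            rs_new = r @ r
            if np.sqrt(rs_new) <= rtol * np.linalg.norm(b):
                break
            p = r + (rs_new / rs_old) * p
            rs_old = rs_new
        return x

    A = np.array([[4.0, 1.0], [1.0, 3.0]])
    b = np.array([1.0, 2.0])
    print(conjugate_gradient(A, b))  # ≈ np.linalg.solve(A, b)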

src/pydvl/utils/numeric.py

Lines changed: 43 additions & 18 deletions
@@ -2,16 +2,14 @@
 This module contains routines for numerical computations used across the
 library.
 """
+from __future__ import annotations
 
 from itertools import chain, combinations
-from typing import Collection, Generator, Iterator, Optional, Tuple, TypeVar
+from typing import Collection, Generator, Iterator, Optional, Tuple, TypeVar, overload
 
 import numpy as np
 from numpy.typing import NDArray
 
-FloatOrArray = TypeVar("FloatOrArray", float, NDArray[np.float_])
-IntOrArray = TypeVar("IntOrArray", int, NDArray[np.int_])
-
 __all__ = [
     "running_moments",
     "linear_regression_analytical_derivative_d2_theta",
@@ -20,6 +18,7 @@
     "num_samples_permutation_hoeffding",
     "powerset",
     "random_matrix_with_condition_number",
+    "random_subset",
     "random_powerset",
     "random_subset_of_size",
     "top_k_value_accuracy",
@@ -63,6 +62,19 @@ def num_samples_permutation_hoeffding(eps: float, delta: float, u_range: float)
     return int(np.ceil(np.log(2 / delta) * 2 * u_range**2 / eps**2))
 
 
+def random_subset(s: NDArray[T], q: float = 0.5) -> NDArray[T]:
+    """Returns one subset at random from ``s``.
+
+    :param s: set to sample from
+    :param q: Sampling probability for elements. The default 0.5 yields a
+        uniform distribution over the power set of s.
+    :return: the subset
+    """
+    rng = np.random.default_rng()
+    selection = rng.uniform(size=len(s)) > q
+    return s[selection]
+
+
 def random_powerset(
     s: NDArray[T], n_samples: Optional[int] = None, q: float = 0.5
 ) -> Generator[NDArray[T], None, None]:
@@ -72,9 +84,8 @@
     See `powerset()` if you wish to deterministically generate all subsets.
 
     To generate subsets, `len(s)` Bernoulli draws with probability `q` are
-    drawn.
-    The default value of `q = 0.5` provides a uniform distribution over the
-    power set of `s`. Other choices can be used e.g. to implement
+    drawn. The default value of `q = 0.5` provides a uniform distribution over
+    the power set of `s`. Other choices can be used e.g. to implement
     :func:`Owen sampling
     <pydvl.value.shapley.montecarlo.owen_sampling_shapley>`.
 
@@ -94,19 +105,17 @@
     if q < 0 or q > 1:
         raise ValueError("Element sampling probability must be in [0,1]")
 
-    rng = np.random.default_rng()
     total = 1
     if n_samples is None:
         n_samples = np.iinfo(np.int32).max
     while total <= n_samples:
-        selection = rng.uniform(size=len(s)) > q
-        subset = s[selection]
-        yield subset
+        yield random_subset(s, q)
         total += 1
 
 
 def random_subset_of_size(s: NDArray[T], size: int) -> NDArray[T]:
-    """Samples a random subset of given size.
+    """Samples a random subset of given size uniformly from the powerset
+    of ``s``.
 
     :param s: Set to sample from
    :param size: Size of the subset to generate
@@ -221,13 +230,29 @@ linear_regression_analytical_derivative_d_x_d_theta(
     return full_derivative / N  # type: ignore
 
 
-# FIXME: FloatOrArray doesn't really work
+@overload
+def running_moments(
+    previous_avg: float, previous_variance: float, count: int, new_value: float
+) -> Tuple[float, float]:
+    ...
+
+
+@overload
+def running_moments(
+    previous_avg: NDArray[np.float_],
+    previous_variance: NDArray[np.float_],
+    count: int,
+    new_value: NDArray[np.float_],
+) -> Tuple[NDArray[np.float_], NDArray[np.float_]]:
+    ...
+
+
 def running_moments(
-    previous_avg: FloatOrArray,
-    previous_variance: FloatOrArray,
-    count: IntOrArray,
-    new_value: FloatOrArray,
-) -> Tuple:  # [FloatOrArray, FloatOrArray]:
+    previous_avg: float | NDArray[np.float_],
+    previous_variance: float | NDArray[np.float_],
+    count: int,
+    new_value: float | NDArray[np.float_],
+) -> Tuple[float | NDArray[np.float_], float | NDArray[np.float_]]:
     """Uses Welford's algorithm to calculate the running average and variance of
     a set of numbers.
 
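
The body of `running_moments` falls outside this hunk. For reference, a Welford
update consistent with the signatures above (a sketch only; the arithmetic works
identically for scalars and arrays, where `count` is the number of samples seen
so far) would be:

    def welford_update(previous_avg, previous_variance, count, new_value):
        # Shift the mean by the scaled residual of the new value.
        new_average = previous_avg + (new_value - previous_avg) / (count + 1)
        # Update the (population) variance using both the old and new means.
        new_variance = previous_variance + (
            (new_value - previous_avg) * (new_value - new_average) - previous_variance
        ) / (count + 1)
        return new_average, new_variance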

src/pydvl/value/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -9,5 +9,7 @@
 from ..utils import Dataset, Scorer, Utility
 from .least_core import *
 from .loo import *
+from .sampler import *
+from .semivalues import *
 from .shapley import *
 from .stopping import *
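
The new `sampler` and `semivalues` modules wired up here are the heart of the
PR: a sampler produces subsets of each index's complement, and a semi-value
method averages weighted marginal utilities over them. As a rough illustration
of that division of labour (hypothetical names; not the actual
`pydvl.value.sampler` interface):

    from itertools import islice

    import numpy as np

    def uniform_sampler(indices):
        # Yield (index, random subset of the remaining indices) pairs forever.
        rng = np.random.default_rng()
        while True:
            idx = int(rng.choice(indices))
            others = indices[indices != idx]
            selection = rng.uniform(size=len(others)) > 0.5
            yield idx, others[selection]

    for idx, subset in islice(uniform_sampler(np.arange(5)), 3):
        print(idx, subset)  # a semi-value method would use u(subset + idx) - u(subset)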
