
Commit 23316c2

Merge pull request #439 from aai-institute/feature/antithetic-sampling
Improve antithetic sampling
2 parents 3d0e579 + c804a4a

4 files changed: 60 additions, 7 deletions

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@
 - Using pytest-xdist for faster local tests
   [PR #440](https://github.com/aai-institute/pyDVL/pull/440)
+- Added `AntitheticPermutationSampler`
+  [PR #439](https://github.com/aai-institute/pyDVL/pull/439)
 - Implementation of Data-OOB by @BastienZim
   [PR #426](https://github.com/aai-institute/pyDVL/pull/426),
   [PR #431](https://github.com/aai-institute/pyDVL/pull/431)

docs/assets/pydvl.bib

Lines changed: 16 additions & 1 deletion
@@ -24,7 +24,7 @@ @article{benmerzoug_re_2023
   doi = {10.5281/zenodo.8173733},
   url = {https://zenodo.org/record/8173733},
   urldate = {2023-08-27},
-  abstract = {Replication}
+  abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.},
 }
 
 @article{castro_polynomial_2009,
@@ -198,6 +198,21 @@ @inproceedings{kwon_efficient_2021
   langid = {english}
 }
 
+@article{mitchell_sampling_2022,
+  title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}},
+  author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey},
+  date = {2022},
+  journaltitle = {Journal of Machine Learning Research},
+  shortjournal = {J. Mach. Learn. Res.},
+  volume = {23},
+  number = {43},
+  pages = {1--46},
+  issn = {1533-7928},
+  url = {http://jmlr.org/papers/v23/21-0439.html},
+  urldate = {2022-10-23},
+  abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere $S^{d-2}$ and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.}
+}
+
 @inproceedings{okhrati_multilinear_2021,
   title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}},
   booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})},

src/pydvl/value/sampler.py

Lines changed: 39 additions & 6 deletions
@@ -27,7 +27,7 @@
 compute any semi-value, in particular Shapley and Beta values, and Banzhaf
 indices.
 
-# Slicing of samplers
+## Slicing of samplers
 
 The samplers can be sliced for parallel computation. For those which are
 embarrassingly parallel, this is done by slicing the set of "outer" indices and
@@ -36,6 +36,15 @@
 and [UniformSampler][pydvl.value.sampler.UniformSampler]. In contrast, slicing a
 [PermutationSampler][pydvl.value.sampler.PermutationSampler] creates a new
 sampler which iterates over the same indices.
+
+
+## References
+
+[^1]: <a name="mitchell_sampling_2022"></a>Mitchell, Rory, Joshua Cooper, Eibe
+    Frank, and Geoffrey Holmes. [Sampling Permutations for Shapley Value
+    Estimation](http://jmlr.org/papers/v23/21-0439.html). Journal of Machine
+    Learning Research 23, no. 43 (2022): 1–46.
+
 """
 
 from __future__ import annotations
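The "slicing of samplers" described in this docstring partitions the "outer" indices across workers. As a rough, generic sketch of that idea (plain NumPy with a hypothetical `split_outer_indices` helper, not pyDVL's actual slicing machinery):

```python
import numpy as np


def split_outer_indices(indices: np.ndarray, n_jobs: int) -> list[np.ndarray]:
    # Hypothetical helper: one chunk of outer indices per worker.
    return np.array_split(indices, n_jobs)


indices = np.arange(10)
for worker_id, chunk in enumerate(split_outer_indices(indices, n_jobs=3)):
    # Each worker iterates only over its chunk of outer indices, while
    # subsets are still drawn from the full index set.
    print(worker_id, chunk.tolist())
```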
@@ -315,18 +324,19 @@ class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[IndexT]):
     """An iterator to perform uniform random sampling of subsets, and their
     complements.
 
-    Works as :class:`~pydvl.value.sampler.UniformSampler`, but for every tuple
-    $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the complement of
-    the set $S$, including the index $i$ itself.
+    Works as [UniformSampler][pydvl.value.sampler.UniformSampler], but for every
+    tuple $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the
+    complement of the set $S$ in the set of indices, excluding $i$.
     """
 
     def __iter__(self) -> Iterator[SampleT]:
         while True:
             for idx in self.iterindices():
-                subset = random_subset(self.complement([idx]), seed=self._rng)
+                _complement = self.complement([idx])
+                subset = random_subset(_complement, seed=self._rng)
                 yield idx, subset
                 self._n_samples += 1
-                yield idx, self.complement(np.concatenate((subset, np.array([idx]))))
+                yield idx, np.setxor1d(_complement, subset)
                 self._n_samples += 1
             if self._n_samples == 0:  # Empty index set
                 break
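The rewrite above computes the antithetic sample with `np.setxor1d` on the already computed complement instead of calling `complement` a second time, matching the corrected docstring. A minimal standalone sketch of the pairing (toy index set, plain NumPy in place of pyDVL's `random_subset` helper):

```python
import numpy as np

rng = np.random.default_rng(42)

indices = np.arange(6)                      # full index set N
idx = 2                                     # outer index i
complement = np.setdiff1d(indices, [idx])   # N \ {i}

# First sample: a uniformly random subset S of N \ {i}.
mask = rng.integers(0, 2, size=len(complement)).astype(bool)
subset = complement[mask]

# Antithetic sample: since S is contained in N \ {i}, the symmetric
# difference is exactly (N \ {i}) \ S, which is what the new code yields.
antithetic = np.setxor1d(complement, subset)

assert set(subset) | set(antithetic) == set(complement)
assert not set(subset) & set(antithetic)
```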
@@ -372,6 +382,29 @@ def weight(cls, n: int, subset_len: int) -> float:
         return n * math.comb(n - 1, subset_len) if n > 0 else 1.0
 
 
+class AntitheticPermutationSampler(PermutationSampler[IndexT]):
+    """Samples permutations like
+    [PermutationSampler][pydvl.value.sampler.PermutationSampler], but after
+    each permutation, it returns the same permutation in reverse order.
+
+    This sampler was suggested in (Mitchell et al. 2022)<sup><a
+    href="#mitchell_sampling_2022">1</a></sup>
+
+    !!! tip "New in version 0.7.1"
+    """
+
+    def __iter__(self) -> Iterator[SampleT]:
+        while True:
+            permutation = self._rng.permutation(self._indices)
+            for perm in permutation, permutation[::-1]:
+                for i, idx in enumerate(perm):
+                    yield idx, perm[:i]
+                    self._n_samples += 1
+
+            if self._n_samples == 0:  # Empty index set
+                break
+
+
 class DeterministicPermutationSampler(PermutationSampler[IndexT]):
     """Samples all n! permutations of the indices deterministically, and
     iterates through them, returning sets as required for the permutation-based
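To make the sampling pattern of the new class concrete, here is a small self-contained sketch that mirrors the loop in `AntitheticPermutationSampler.__iter__` above (toy indices, plain NumPy, without the sampler's bookkeeping):

```python
import numpy as np

rng = np.random.default_rng(0)
indices = np.arange(4)

# One antithetic draw: a random permutation followed by its reverse.
permutation = rng.permutation(indices)

for perm in (permutation, permutation[::-1]):
    for i, idx in enumerate(perm):
        # Each pair is (idx, predecessors of idx in the permutation), i.e.
        # the coalition S used for idx's marginal contribution.
        print(idx, perm[:i].tolist())
```

Pairing each permutation with its reverse produces negatively correlated marginal contributions, which is the variance-reduction idea from Mitchell et al. (2022) that motivates this sampler.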

tests/value/test_semivalues.py

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,7 @@
 from pydvl.parallel.config import ParallelConfig
 from pydvl.utils.types import Seed
 from pydvl.value.sampler import (
+    AntitheticPermutationSampler,
     AntitheticSampler,
     DeterministicPermutationSampler,
     DeterministicUniformSampler,
@@ -36,6 +37,7 @@
         UniformSampler,
         PermutationSampler,
         AntitheticSampler,
+        AntitheticPermutationSampler,
     ],
 )
 @pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)])
@@ -112,6 +114,7 @@ def test_shapley_batch_size(
         UniformSampler,
         PermutationSampler,
         AntitheticSampler,
+        AntitheticPermutationSampler,
     ],
 )
 def test_banzhaf(
