
Commit 23316c2

Merge pull request #439 from aai-institute/feature/antithetic-sampling
Improve antithetic sampling
2 parents 3d0e579 + c804a4a

4 files changed: 60 additions, 7 deletions

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@
 - Using pytest-xdist for faster local tests
   [PR #440](https://github.com/aai-institute/pyDVL/pull/440)
+- Added `AntitheticPermutationSampler`
+  [PR #439](https://github.com/aai-institute/pyDVL/pull/439)
 - Implementation of Data-OOB by @BastienZim
   [PR #426](https://github.com/aai-institute/pyDVL/pull/426),
   [PR #431](https://github.com/aai-institute/pyDVL/pull/431)

docs/assets/pydvl.bib

Lines changed: 16 additions & 1 deletion
@@ -24,7 +24,7 @@ @article{benmerzoug_re_2023
   doi = {10.5281/zenodo.8173733},
   url = {https://zenodo.org/record/8173733},
   urldate = {2023-08-27},
-  abstract = {Replication}
+  abstract = {We investigate the results of [1] in the field of data valuation. We repeat their experiments and conclude that the (Monte Carlo) Least Core is sensitive to important characteristics of the ML problem of interest, making it difficult to apply.},
 }
 
 @article{castro_polynomial_2009,
@@ -198,6 +198,21 @@ @inproceedings{kwon_efficient_2021
   langid = {english}
 }
 
+@article{mitchell_sampling_2022,
+  title = {Sampling {{Permutations}} for {{Shapley Value Estimation}}},
+  author = {Mitchell, Rory and Cooper, Joshua and Frank, Eibe and Holmes, Geoffrey},
+  date = {2022},
+  journaltitle = {Journal of Machine Learning Research},
+  shortjournal = {J. Mach. Learn. Res.},
+  volume = {23},
+  number = {43},
+  pages = {1--46},
+  issn = {1533-7928},
+  url = {http://jmlr.org/papers/v23/21-0439.html},
+  urldate = {2022-10-23},
+  abstract = {Game-theoretic attribution techniques based on Shapley values are used to interpret black-box machine learning models, but their exact calculation is generally NP-hard, requiring approximation methods for non-trivial models. As the computation of Shapley values can be expressed as a summation over a set of permutations, a common approach is to sample a subset of these permutations for approximation. Unfortunately, standard Monte Carlo sampling methods can exhibit slow convergence, and more sophisticated quasi-Monte Carlo methods have not yet been applied to the space of permutations. To address this, we investigate new approaches based on two classes of approximation methods and compare them empirically. First, we demonstrate quadrature techniques in a RKHS containing functions of permutations, using the Mallows kernel in combination with kernel herding and sequential Bayesian quadrature. The RKHS perspective also leads to quasi-Monte Carlo type error bounds, with a tractable discrepancy measure defined on permutations. Second, we exploit connections between the hypersphere $S^{d-2}$ and permutations to create practical algorithms for generating permutation samples with good properties. Experiments show the above techniques provide significant improvements for Shapley value estimates over existing methods, converging to a smaller RMSE in the same number of model evaluations.}
+}
+
 @inproceedings{okhrati_multilinear_2021,
   title = {A {{Multilinear Sampling Algorithm}} to {{Estimate Shapley Values}}},
   booktitle = {2020 25th {{International Conference}} on {{Pattern Recognition}} ({{ICPR}})},

src/pydvl/value/sampler.py

Lines changed: 39 additions & 6 deletions
@@ -27,7 +27,7 @@
 compute any semi-value, in particular Shapley and Beta values, and Banzhaf
 indices.
 
-# Slicing of samplers
+## Slicing of samplers
 
 The samplers can be sliced for parallel computation. For those which are
 embarrassingly parallel, this is done by slicing the set of "outer" indices and
@@ -36,6 +36,15 @@
 and [UniformSampler][pydvl.value.sampler.UniformSampler]. In contrast, slicing a
 [PermutationSampler][pydvl.value.sampler.PermutationSampler] creates a new
 sampler which iterates over the same indices.
+
+
+## References
+
+[^1]: <a name="mitchell_sampling_2022"></a>Mitchell, Rory, Joshua Cooper, Eibe
+    Frank, and Geoffrey Holmes. [Sampling Permutations for Shapley Value
+    Estimation](http://jmlr.org/papers/v23/21-0439.html). Journal of Machine
+    Learning Research 23, no. 43 (2022): 1–46.
+
 """
 
 from __future__ import annotations
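The "slicing of samplers" described in this docstring partitions the "outer" indices across workers. As a rough, generic sketch of that idea (plain NumPy with a hypothetical `split_outer_indices` helper, not pyDVL's actual slicing machinery):

```python
import numpy as np


def split_outer_indices(indices: np.ndarray, n_jobs: int) -> list[np.ndarray]:
    # Hypothetical helper: one chunk of outer indices per worker.
    return np.array_split(indices, n_jobs)


indices = np.arange(10)
for worker_id, chunk in enumerate(split_outer_indices(indices, n_jobs=3)):
    # Each worker iterates only over its chunk of outer indices, while
    # subsets are still drawn from the full index set.
    print(worker_id, chunk.tolist())
```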
@@ -315,18 +324,19 @@ class AntitheticSampler(StochasticSamplerMixin, PowersetSampler[IndexT]):
     """An iterator to perform uniform random sampling of subsets, and their
     complements.
 
-    Works as :class:`~pydvl.value.sampler.UniformSampler`, but for every tuple
-    $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the complement of
-    the set $S$, including the index $i$ itself.
+    Works as [UniformSampler][pydvl.value.sampler.UniformSampler], but for every
+    tuple $(i,S)$, it subsequently returns $(i,S^c)$, where $S^c$ is the
+    complement of the set $S$ in the set of indices, excluding $i$.
     """
 
     def __iter__(self) -> Iterator[SampleT]:
         while True:
             for idx in self.iterindices():
-                subset = random_subset(self.complement([idx]), seed=self._rng)
+                _complement = self.complement([idx])
+                subset = random_subset(_complement, seed=self._rng)
                 yield idx, subset
                 self._n_samples += 1
-                yield idx, self.complement(np.concatenate((subset, np.array([idx]))))
+                yield idx, np.setxor1d(_complement, subset)
                 self._n_samples += 1
             if self._n_samples == 0:  # Empty index set
                 break
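The rewrite above computes the antithetic sample with `np.setxor1d` on the already computed complement instead of calling `complement` a second time, matching the corrected docstring. A minimal standalone sketch of the pairing (toy index set, plain NumPy in place of pyDVL's `random_subset` helper):

```python
import numpy as np

rng = np.random.default_rng(42)

indices = np.arange(6)                      # full index set N
idx = 2                                     # outer index i
complement = np.setdiff1d(indices, [idx])   # N \ {i}

# First sample: a uniformly random subset S of N \ {i}.
mask = rng.integers(0, 2, size=len(complement)).astype(bool)
subset = complement[mask]

# Antithetic sample: since S is contained in N \ {i}, the symmetric
# difference is exactly (N \ {i}) \ S, which is what the new code yields.
antithetic = np.setxor1d(complement, subset)

assert set(subset) | set(antithetic) == set(complement)
assert not set(subset) & set(antithetic)
```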
@@ -372,6 +382,29 @@ def weight(cls, n: int, subset_len: int) -> float:
         return n * math.comb(n - 1, subset_len) if n > 0 else 1.0
 
 
+class AntitheticPermutationSampler(PermutationSampler[IndexT]):
+    """Samples permutations like
+    [PermutationSampler][pydvl.value.sampler.PermutationSampler], but after
+    each permutation, it returns the same permutation in reverse order.
+
+    This sampler was suggested in (Mitchell et al. 2022)<sup><a
+    href="#mitchell_sampling_2022">1</a></sup>
+
+    !!! tip "New in version 0.7.1"
+    """
+
+    def __iter__(self) -> Iterator[SampleT]:
+        while True:
+            permutation = self._rng.permutation(self._indices)
+            for perm in permutation, permutation[::-1]:
+                for i, idx in enumerate(perm):
+                    yield idx, perm[:i]
+                    self._n_samples += 1
+
+            if self._n_samples == 0:  # Empty index set
+                break
+
+
 class DeterministicPermutationSampler(PermutationSampler[IndexT]):
     """Samples all n! permutations of the indices deterministically, and
     iterates through them, returning sets as required for the permutation-based
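To make the sampling pattern of the new class concrete, here is a small self-contained sketch that mirrors the loop in `AntitheticPermutationSampler.__iter__` above (toy indices, plain NumPy, without the sampler's bookkeeping):

```python
import numpy as np

rng = np.random.default_rng(0)
indices = np.arange(4)

# One antithetic draw: a random permutation followed by its reverse.
permutation = rng.permutation(indices)

for perm in (permutation, permutation[::-1]):
    for i, idx in enumerate(perm):
        # Each pair is (idx, predecessors of idx in the permutation), i.e.
        # the coalition S used for idx's marginal contribution.
        print(idx, perm[:i].tolist())
```

Pairing each permutation with its reverse produces negatively correlated marginal contributions, which is the variance-reduction idea from Mitchell et al. (2022) that motivates this sampler.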

tests/value/test_semivalues.py

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,7 @@
 from pydvl.parallel.config import ParallelConfig
 from pydvl.utils.types import Seed
 from pydvl.value.sampler import (
+    AntitheticPermutationSampler,
     AntitheticSampler,
     DeterministicPermutationSampler,
     DeterministicUniformSampler,
@@ -36,6 +37,7 @@
         UniformSampler,
         PermutationSampler,
         AntitheticSampler,
+        AntitheticPermutationSampler,
     ],
 )
 @pytest.mark.parametrize("coefficient", [shapley_coefficient, beta_coefficient(1, 1)])
@@ -112,6 +114,7 @@ def test_shapley_batch_size(
         UniformSampler,
         PermutationSampler,
         AntitheticSampler,
+        AntitheticPermutationSampler,
     ],
 )
 def test_banzhaf(
