Skip to content

Commit 64ed463

Browse files
committed
Merge branch 'develop' into fix/missing-tests
# Conflicts: # CHANGELOG.md # src/pydvl/value/shapley/gt.py # src/pydvl/value/shapley/owen.py # tests/value/shapley/test_montecarlo.py
2 parents 4f18ec6 + 783b0a0 commit 64ed463

File tree

15 files changed

+214
-197
lines changed

15 files changed

+214
-197
lines changed

CHANGELOG.md

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,22 @@
55
- Fixed parallel and antithetic Owen sampling for Shapley values. Simplified
66
and extended tests.
77
[PR #267](https://github.com/appliedAI-Initiative/pyDVL/pull/267)
8-
- Added `Scorer` class for a cleaner interface. Fix minor bugs around
9-
Group-Testing Shapley and add more tests.
8+
- Added `Scorer` class for a cleaner interface. Fixed minor bugs around
9+
Group-Testing Shapley, added more tests and switched to cvxpy for the solver.
1010
[PR #264](https://github.com/appliedAI-Initiative/pyDVL/pull/264)
1111
- Generalised stopping criteria for valuation algorithms. Improved classes
1212
`ValuationResult` and `Status` with more operations. Some minor issues fixed.
1313
[PR #252](https://github.com/appliedAI-Initiative/pyDVL/pull/250)
14-
- Operations on `ValuationResult` and `Status` and cleanup
15-
[PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
16-
- Splitting of problem preparation and solution in Least Core computation.
14+
- Fixed a bug whereby `compute_shapley_values` would only spawn one process when
15+
using `n_jobs=-1` and Monte Carlo methods.
16+
[PR #270](https://github.com/appliedAI-Initiative/pyDVL/pull/270)
17+
- Bugfix in `RayParallelBackend`: wrong semantics for `kwargs`.
18+
[PR #268](https://github.com/appliedAI-Initiative/pyDVL/pull/268)
19+
- Splitting of problem preparation and solution in Least-Core computation.
1720
Umbrella function for LC methods.
18-
[PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
21+
[PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
22+
- Operations on `ValuationResult` and `Status` and some cleanup
23+
[PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
1924
- **Bug fix and minor improvements**: Fixes bug in TMCS with remote Ray cluster,
2025
raises an error for dummy sequential parallel backend with TMCS, clones model
2126
inside `Utility` before fitting by default, with flag `clone_before_fit`
@@ -34,8 +39,8 @@
3439
- **Breaking change:** Introduces a class ValuationResult to gather and inspect
3540
results from all valuation algorithms
3641
[PR #214](https://github.com/appliedAI-Initiative/pyDVL/pull/214)
37-
- Fixes bug in Influence calculation with multi-dimensional input and adds
38-
new example notebook
42+
- Fixes bug in Influence calculation with multidimensional input and adds new
43+
example notebook
3944
[PR #195](https://github.com/appliedAI-Initiative/pyDVL/pull/195)
4045
- **Breaking change**: Passes the input to `MapReduceJob` at initialization,
4146
removes `chunkify_inputs` argument from `MapReduceJob`, removes `n_runs`

docs/30-data-valuation.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ $$
401401

402402
Usage follows the same pattern as every other Shapley method, but with the
403403
addition of an ``epsilon`` parameter required for the solution of the CSP. It
404-
shouldbe the same value used to compute the minimum number of samples required.
404+
should be the same value used to compute the minimum number of samples required.
405405
This can be done with :func:`~pydvl.value.shapley.gt.num_samples_eps_delta`, but
406406
note that the number returned will be huge! In practice, fewer samples can be
407407
enough, but the actual number will strongly depend on the utility, in particular

src/pydvl/utils/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,7 +524,7 @@ def load_spotify_dataset(
524524
if file_path.exists():
525525
data = pd.read_csv(file_path)
526526
else:
527-
url = "https://github.com/appliedAI-Initiative/pyDVL/blob/develop/data/top_hits_spotify_dataset.csv"
527+
url = "https://raw.githubusercontent.com/appliedAI-Initiative/pyDVL/develop/data/top_hits_spotify_dataset.csv"
528528
data = pd.read_csv(url)
529529
data.to_csv(file_path, index=False)
530530

src/pydvl/utils/parallel/actor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class RayActorWrapper:
4141
def __init__(self, actor_class: Type, config: ParallelConfig, *args, **kwargs):
4242
parallel_backend = cast(RayParallelBackend, init_parallel_backend(config))
4343
remote_cls = parallel_backend.wrap(actor_class)
44-
self.actor_handle = remote_cls.remote(*args, **kwargs)
44+
self.actor_handle = remote_cls(*args, **kwargs)
4545

4646
def remote_caller(method_name: str):
4747
# Wrapper for remote class' methods to mimic local calls

src/pydvl/utils/parallel/backend.py

Lines changed: 52 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,26 @@
1-
import functools
21
import os
32
from abc import ABCMeta, abstractmethod
43
from dataclasses import asdict
5-
from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
4+
from typing import (
5+
Any,
6+
Callable,
7+
Dict,
8+
Iterable,
9+
List,
10+
Optional,
11+
Tuple,
12+
Type,
13+
TypeVar,
14+
Union,
15+
)
616

717
import ray
818
from ray import ObjectRef
919
from ray.remote_function import RemoteFunction
1020

1121
from ..config import ParallelConfig
1222

13-
__all__ = [
14-
"init_parallel_backend",
15-
]
23+
__all__ = ["init_parallel_backend", "effective_n_jobs", "available_cpus"]
1624

1725
T = TypeVar("T")
1826

@@ -63,7 +71,7 @@ def put(self, v: Any, *args, **kwargs) -> Any:
6371
...
6472

6573
@abstractmethod
66-
def wrap(self, *args, **kwargs) -> Any:
74+
def wrap(self, fun: Callable, **kwargs) -> Callable:
6775
...
6876

6977
@abstractmethod
@@ -104,9 +112,11 @@ def get(self, v: Any, *args, **kwargs):
104112
def put(self, v: Any, *args, **kwargs) -> Any:
105113
return v
106114

107-
def wrap(self, *args, **kwargs) -> Any:
108-
assert len(args) == 1
109-
return functools.partial(args[0], **kwargs)
115+
def wrap(self, fun: Callable, **kwargs) -> Callable:
116+
"""Wraps a function for sequential execution.
117+
118+
This is a noop and kwargs are ignored."""
119+
return fun
110120

111121
def wait(self, v: Any, *args, **kwargs) -> Tuple[list, list]:
112122
return v, []
@@ -151,8 +161,17 @@ def put(self, v: T, *args, **kwargs) -> Union["ObjectRef[T]", T]:
151161
except TypeError:
152162
return v # type: ignore
153163

154-
def wrap(self, *args, **kwargs) -> RemoteFunction:
155-
return ray.remote(*args, **kwargs) # type: ignore
164+
def wrap(self, fun: Callable, **kwargs) -> Callable:
165+
"""Wraps a function as a ray remote.
166+
167+
:param fun: the function to wrap
168+
:param kwargs: keyword arguments to pass to @ray.remote
169+
170+
:return: The `.remote` method of the ray `RemoteFunction`.
171+
"""
172+
if len(kwargs) > 1:
173+
return ray.remote(**kwargs)(fun).remote # type: ignore
174+
return ray.remote(fun).remote # type: ignore
156175

157176
def wait(
158177
self,
@@ -213,3 +232,25 @@ def available_cpus() -> int:
213232
if system() != "Linux":
214233
return os.cpu_count() or 1
215234
return len(os.sched_getaffinity(0))
235+
236+
237+
def effective_n_jobs(n_jobs: int, config: ParallelConfig = ParallelConfig()) -> int:
238+
"""Returns the effective number of jobs.
239+
240+
This number may vary depending on the parallel backend and the resources
241+
available.
242+
243+
:param n_jobs: the number of jobs requested. If -1, the number of available
244+
CPUs is returned.
245+
:param config: instance of :class:`~pydvl.utils.config.ParallelConfig` with
246+
cluster address, number of cpus, etc.
247+
:return: the effective number of jobs, guaranteed to be >= 1.
248+
:raises RuntimeError: if the effective number of jobs returned by the backend
249+
is < 1.
250+
"""
251+
parallel_backend = init_parallel_backend(config)
252+
if (eff_n_jobs := parallel_backend.effective_n_jobs(n_jobs)) < 1:
253+
raise RuntimeError(
254+
f"Invalid number of jobs {eff_n_jobs} obtained from parallel backend {config.backend}"
255+
)
256+
return eff_n_jobs

src/pydvl/utils/parallel/map_reduce.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,19 @@ def reduce(self, chunks: List["ObjectRef[R]"]) -> R:
219219
return result # type: ignore
220220

221221
def _wrap_function(self, func: Callable, **kwargs) -> Callable:
222-
remote_func = self.parallel_backend.wrap(
222+
"""Wraps a function with a timeout and remote arguments and puts it on
223+
the remote backend.
224+
225+
:param func: Function to wrap
226+
:param kwargs: Additional keyword arguments to pass to the backend
227+
wrapper. These are *not* arguments for the wrapped function.
228+
:return: Remote function that can be called with the same arguments as
229+
the wrapped function. Depending on the backend, this may simply be
230+
the function itself.
231+
"""
232+
return self.parallel_backend.wrap(
223233
_wrap_func_with_remote_args(func, timeout=self.timeout), **kwargs
224234
)
225-
return getattr(remote_func, "remote", remote_func) # type: ignore
226235

227236
def _backpressure(
228237
self, jobs: List[ObjectRef], n_dispatched: int, n_finished: int

src/pydvl/value/least_core/common.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,6 @@ def lc_solve_problem(
3737
:func:`~pydvl.value.least_core.montecarlo.montecarlo_least_core` for
3838
argument descriptions.
3939
"""
40-
if options is None:
41-
options = {}
4240
n = len(u.data)
4341

4442
if np.any(np.isnan(problem.utility_values)):

src/pydvl/value/least_core/montecarlo.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pydvl.utils.config import ParallelConfig
88
from pydvl.utils.numeric import random_powerset
99
from pydvl.utils.parallel import MapReduceJob
10+
from pydvl.utils.parallel.backend import effective_n_jobs
1011
from pydvl.utils.progress import maybe_progress
1112
from pydvl.utils.utility import Utility
1213
from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
@@ -136,7 +137,7 @@ def mclc_prepare_problem(
136137
)
137138
n_iterations = 2**n
138139

139-
iterations_per_job = max(1, n_iterations // n_jobs)
140+
iterations_per_job = max(1, n_iterations // effective_n_jobs(n_jobs, config))
140141

141142
map_reduce_job: MapReduceJob["Utility", "LeastCoreProblem"] = MapReduceJob(
142143
inputs=u,

src/pydvl/value/shapley/__init__.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,20 +61,23 @@ def compute_shapley_values(
6161
:func:`~pydvl.value.shapley.montecarlo.truncated_montecarlo_shapley`.
6262
- ``owen_sampling``: Uses the Owen continuous extension of the utility
6363
function to the unit cube. Implemented in
64-
:func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`.
65-
This method requires an additional parameter `q_max` for the number of
66-
subdivisions of the unit interval to use for integration.
64+
:func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`. This
65+
method does not take a :class:`~pydvl.value.stopping.StoppingCriterion`
66+
but instead requires a parameter ``q_max`` for the number of subdivisions
67+
of the unit interval to use for integration, and another parameter
68+
``n_samples`` for the number of subsets to sample for each $q$.
6769
- ``owen_halved``: Same as 'owen_sampling' but uses correlated samples in the
6870
expectation. Implemented in
6971
:func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`.
7072
This method requires an additional parameter `q_max` for the number of
71-
subdivisions of the interval [0,0.5] to use for integration.
73+
subdivisions of the interval [0,0.5] to use for integration, and another
74+
parameter ``n_samples`` for the number of subsets to sample for each $q$.
7275
- ``group_testing``: estimates differences of Shapley values and solves a
7376
constraint satisfaction problem. High sample complexity, not recommended.
74-
Implemented in :func:`~pydvl.value.shapley.gt.group_testing_shapley`. Only
75-
accepts :class:`~pydvl.value.stopping.MaxUpdates` (use
76-
:func:`~pydvl.value.shapley.gt.num_samples_eps_delta` to compute a bound)
77-
and :class:`~pydvl.value.stopping.MaxTime` as stopping criteria.
77+
Implemented in :func:`~pydvl.value.shapley.gt.group_testing_shapley`. This
78+
method does not take a :class:`~pydvl.value.stopping.StoppingCriterion`
79+
but instead requires a parameter ``n_samples`` for the number of
80+
iterations to run.
7881
7982
Additionally, one can use model-specific methods:
8083
@@ -126,8 +129,8 @@ def compute_shapley_values(
126129
elif mode == ShapleyMode.PermutationExact:
127130
return permutation_exact_shapley(u, progress=progress)
128131
elif mode == ShapleyMode.Owen or mode == ShapleyMode.OwenAntithetic:
129-
if kwargs.get("n_iterations") is None:
130-
raise ValueError("n_iterations cannot be None for Owen methods")
132+
if kwargs.get("n_samples") is None:
133+
raise ValueError("n_samples cannot be None for Owen methods")
131134
if kwargs.get("max_q") is None:
132135
raise ValueError("Owen Sampling requires max_q for the outer integral")
133136

@@ -138,17 +141,17 @@ def compute_shapley_values(
138141
)
139142
return owen_sampling_shapley(
140143
u,
141-
n_iterations=int(kwargs.get("n_iterations", -1)),
144+
n_samples=int(kwargs.get("n_samples", -1)),
142145
max_q=int(kwargs.get("max_q", -1)),
143146
method=method,
144147
n_jobs=n_jobs,
145148
)
146149
elif mode == ShapleyMode.KNN:
147150
return knn_shapley(u, progress=progress)
148151
elif mode == ShapleyMode.GroupTesting:
149-
n_iterations = kwargs.pop("n_iterations")
150-
if n_iterations is None:
151-
raise ValueError("n_iterations cannot be None for Group Testing")
152+
n_samples = kwargs.pop("n_samples")
153+
if n_samples is None:
154+
raise ValueError("n_samples cannot be None for Group Testing")
152155
epsilon = kwargs.pop("epsilon")
153156
if epsilon is None:
154157
raise ValueError("Group Testing requires error bound epsilon")
@@ -157,7 +160,7 @@ def compute_shapley_values(
157160
u,
158161
epsilon=epsilon,
159162
delta=delta,
160-
n_iterations=n_iterations,
163+
n_samples=n_samples,
161164
n_jobs=n_jobs,
162165
progress=progress,
163166
**kwargs,

0 commit comments

Comments
 (0)