aai-institute
diff --git a/‎CHANGELOG.md‎
Lines changed: 13 additions & 8 deletions b/‎CHANGELOG.md‎
Lines changed: 13 additions & 8 deletions
diff --git a/‎docs/30-data-valuation.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/30-data-valuation.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/pydvl/utils/dataset.py‎
Lines changed: 1 addition & 1 deletion b/‎src/pydvl/utils/dataset.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/pydvl/utils/parallel/actor.py‎
Lines changed: 1 addition & 1 deletion b/‎src/pydvl/utils/parallel/actor.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/pydvl/utils/parallel/backend.py‎
Lines changed: 52 additions & 11 deletions b/‎src/pydvl/utils/parallel/backend.py‎
Lines changed: 52 additions & 11 deletions
diff --git a/‎src/pydvl/utils/parallel/map_reduce.py‎
Lines changed: 11 additions & 2 deletions b/‎src/pydvl/utils/parallel/map_reduce.py‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎src/pydvl/value/least_core/common.py‎
Lines changed: 0 additions & 2 deletions b/‎src/pydvl/value/least_core/common.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/pydvl/value/least_core/montecarlo.py‎
Lines changed: 2 additions & 1 deletion b/‎src/pydvl/value/least_core/montecarlo.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/pydvl/value/shapley/__init__.py‎
Lines changed: 18 additions & 15 deletions b/‎src/pydvl/value/shapley/__init__.py‎
Lines changed: 18 additions & 15 deletions
@@ -5,17 +5,22 @@
 - Fixed parallel and antithetic Owen sampling for Shapley values. Simplified
   and extended tests.
   [PR #267](https://github.com/appliedAI-Initiative/pyDVL/pull/267)
-- Added `Scorer` class for a cleaner interface. Fix minor bugs around
-  Group-Testing Shapley and add more tests.
+- Added `Scorer` class for a cleaner interface. Fixed minor bugs around
+  Group-Testing Shapley, added more tests and switched to cvxpy for the solver.
   [PR #264](https://github.com/appliedAI-Initiative/pyDVL/pull/264)
 - Generalised stopping criteria for valuation algorithms. Improved classes
   `ValuationResult` and `Status` with more operations. Some minor issues fixed.
   [PR #252](https://github.com/appliedAI-Initiative/pyDVL/pull/250)
-- Operations on `ValuationResult` and `Status` and cleanup
-  [PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
-- Splitting of problem preparation and solution in Least Core computation.
+- Fixed a bug whereby `compute_shapley_values` would only spawn one process when
+  using `n_jobs=-1` and Monte Carlo methods.
+  [PR #270](https://github.com/appliedAI-Initiative/pyDVL/pull/270)
+- Bugfix in `RayParallelBackend`: wrong semantics for `kwargs`.
+  [PR #268](https://github.com/appliedAI-Initiative/pyDVL/pull/268)
+- Splitting of problem preparation and solution in Least-Core computation.
   Umbrella function for LC methods.
-  [PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
+  [PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257) 
+- Operations on `ValuationResult` and `Status` and some cleanup
+  [PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
 - **Bug fix and minor improvements**: Fixes bug in TMCS with remote Ray cluster,
   raises an error for dummy sequential parallel backend with TMCS, clones model
   inside `Utility` before fitting by default, with flag `clone_before_fit` 
@@ -34,8 +39,8 @@
 - **Breaking change:** Introduces a class ValuationResult to gather and inspect
   results from all valuation algorithms
   [PR #214](https://github.com/appliedAI-Initiative/pyDVL/pull/214)
-- Fixes bug in Influence calculation with multi-dimensional input and adds
-  new example notebook
+- Fixes bug in Influence calculation with multidimensional input and adds new
+  example notebook
   [PR #195](https://github.com/appliedAI-Initiative/pyDVL/pull/195)
 - **Breaking change**: Passes the input to `MapReduceJob` at initialization,
   removes `chunkify_inputs` argument from `MapReduceJob`, removes `n_runs`
 
@@ -401,7 +401,7 @@ $$
 
 Usage follows the same pattern as every other Shapley method, but with the
 addition of an ``epsilon`` parameter required for the solution of the CSP. It
-shouldbe the same value used to compute the minimum number of samples required.
+should be the same value used to compute the minimum number of samples required.
 This can be done with :func:`~pydvl.value.shapley.gt.num_samples_eps_delta`, but
 note that the number returned will be huge! In practice, fewer samples can be
 enough, but the actual number will strongly depend on the utility, in particular
 
@@ -524,7 +524,7 @@ def load_spotify_dataset(
     if file_path.exists():
         data = pd.read_csv(file_path)
     else:
-        url = "https://github.com/appliedAI-Initiative/pyDVL/blob/develop/data/top_hits_spotify_dataset.csv"
+        url = "https://raw.githubusercontent.com/appliedAI-Initiative/pyDVL/develop/data/top_hits_spotify_dataset.csv"
         data = pd.read_csv(url)
         data.to_csv(file_path, index=False)
 
 
@@ -41,7 +41,7 @@ class RayActorWrapper:
     def __init__(self, actor_class: Type, config: ParallelConfig, *args, **kwargs):
         parallel_backend = cast(RayParallelBackend, init_parallel_backend(config))
         remote_cls = parallel_backend.wrap(actor_class)
-        self.actor_handle = remote_cls.remote(*args, **kwargs)
+        self.actor_handle = remote_cls(*args, **kwargs)
 
         def remote_caller(method_name: str):
             # Wrapper for remote class' methods to mimic local calls
 
@@ -1,18 +1,26 @@
-import functools
 import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import asdict
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
 
 import ray
 from ray import ObjectRef
 from ray.remote_function import RemoteFunction
 
 from ..config import ParallelConfig
 
-__all__ = [
-    "init_parallel_backend",
-]
+__all__ = ["init_parallel_backend", "effective_n_jobs", "available_cpus"]
 
 T = TypeVar("T")
 
@@ -63,7 +71,7 @@ def put(self, v: Any, *args, **kwargs) -> Any:
         ...
 
     @abstractmethod
-    def wrap(self, *args, **kwargs) -> Any:
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
         ...
 
     @abstractmethod
@@ -104,9 +112,11 @@ def get(self, v: Any, *args, **kwargs):
     def put(self, v: Any, *args, **kwargs) -> Any:
         return v
 
-    def wrap(self, *args, **kwargs) -> Any:
-        assert len(args) == 1
-        return functools.partial(args[0], **kwargs)
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
+        """Wraps a function for sequential execution.
+
+        This is a noop and kwargs are ignored."""
+        return fun
 
     def wait(self, v: Any, *args, **kwargs) -> Tuple[list, list]:
         return v, []
@@ -151,8 +161,17 @@ def put(self, v: T, *args, **kwargs) -> Union["ObjectRef[T]", T]:
         except TypeError:
             return v  # type: ignore
 
-    def wrap(self, *args, **kwargs) -> RemoteFunction:
-        return ray.remote(*args, **kwargs)  # type: ignore
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
+        """Wraps a function as a ray remote.
+
+        :param fun: the function to wrap
+        :param kwargs: keyword arguments to pass to @ray.remote
+
+        :return: The `.remote` method of the ray `RemoteFunction`.
+        """
+        if kwargs:  # forward options even when only a single one is given
+            return ray.remote(**kwargs)(fun).remote  # type: ignore
+        return ray.remote(fun).remote  # type: ignore
 
     def wait(
         self,
@@ -213,3 +232,25 @@ def available_cpus() -> int:
     if system() != "Linux":
         return os.cpu_count() or 1
     return len(os.sched_getaffinity(0))
+
+
+def effective_n_jobs(n_jobs: int, config: ParallelConfig = ParallelConfig()) -> int:
+    """Returns the effective number of jobs.
+
+    This number may vary depending on the parallel backend and the resources
+    available.
+
+    :param n_jobs: the number of jobs requested. If -1, the number of available
+        CPUs is returned.
+    :param config: instance of :class:`~pydvl.utils.config.ParallelConfig` with
+        cluster address, number of cpus, etc.
+    :return: the effective number of jobs, guaranteed to be >= 1.
+    :raises RuntimeError: if the effective number of jobs returned by the backend
+        is < 1.
+    """
+    parallel_backend = init_parallel_backend(config)
+    if (eff_n_jobs := parallel_backend.effective_n_jobs(n_jobs)) < 1:
+        raise RuntimeError(
+            f"Invalid number of jobs {eff_n_jobs} obtained from parallel backend {config.backend}"
+        )
+    return eff_n_jobs
@@ -219,10 +219,19 @@ def reduce(self, chunks: List["ObjectRef[R]"]) -> R:
         return result  # type: ignore
 
     def _wrap_function(self, func: Callable, **kwargs) -> Callable:
-        remote_func = self.parallel_backend.wrap(
+        """Wraps a function with a timeout and remote arguments and puts it on
+        the remote backend.
+
+        :param func: Function to wrap
+        :param kwargs: Additional keyword arguments to pass to the backend
+            wrapper. These are *not* arguments for the wrapped function.
+        :return: Remote function that can be called with the same arguments as
+            the wrapped function. Depending on the backend, this may simply be
+            the function itself.
+        """
+        return self.parallel_backend.wrap(
             _wrap_func_with_remote_args(func, timeout=self.timeout), **kwargs
         )
-        return getattr(remote_func, "remote", remote_func)  # type: ignore
 
     def _backpressure(
         self, jobs: List[ObjectRef], n_dispatched: int, n_finished: int
 
@@ -37,8 +37,6 @@ def lc_solve_problem(
     :func:`~pydvl.value.least_core.montecarlo.montecarlo_least_core` for
     argument descriptions.
     """
-    if options is None:
-        options = {}
     n = len(u.data)
 
     if np.any(np.isnan(problem.utility_values)):
 
@@ -7,6 +7,7 @@
 from pydvl.utils.config import ParallelConfig
 from pydvl.utils.numeric import random_powerset
 from pydvl.utils.parallel import MapReduceJob
+from pydvl.utils.parallel.backend import effective_n_jobs
 from pydvl.utils.progress import maybe_progress
 from pydvl.utils.utility import Utility
 from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
@@ -136,7 +137,7 @@ def mclc_prepare_problem(
         )
         n_iterations = 2**n
 
-    iterations_per_job = max(1, n_iterations // n_jobs)
+    iterations_per_job = max(1, n_iterations // effective_n_jobs(n_jobs, config))
 
     map_reduce_job: MapReduceJob["Utility", "LeastCoreProblem"] = MapReduceJob(
         inputs=u,
 
@@ -61,20 +61,23 @@ def compute_shapley_values(
       :func:`~pydvl.value.shapley.montecarlo.truncated_montecarlo_shapley`.
     - ``owen_sampling``: Uses the Owen continuous extension of the utility
       function to the unit cube. Implemented in
-      :func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`.
-      This method requires an additional parameter `q_max` for the number of
-      subdivisions of the unit interval to use for integration.
+      :func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`. This
+      method does not take a :class:`~pydvl.value.stopping.StoppingCriterion`
+      but instead requires a parameter ``max_q`` for the number of subdivisions
+      of the unit interval to use for integration, and another parameter
+      ``n_samples`` for the number of subsets to sample for each $q$.
     - ``owen_halved``: Same as 'owen_sampling' but uses correlated samples in the
       expectation. Implemented in
       :func:`~pydvl.value.shapley.montecarlo.owen_sampling_shapley`.
       This method  requires an additional parameter `q_max` for the number of
-      subdivisions of the interval [0,0.5] to use for integration.
+      subdivisions of the interval [0,0.5] to use for integration, and another
+      parameter ``n_samples`` for the number of subsets to sample for each $q$.
     - ``group_testing``: estimates differences of Shapley values and solves a
       constraint satisfaction problem. High sample complexity, not recommended.
-      Implemented in :func:`~pydvl.value.shapley.gt.group_testing_shapley`. Only
-      accepts :class:`~pydvl.value.stopping.MaxUpdates` (use
-      :func:`~pydvl.value.shapley.gt.num_samples_eps_delta` to compute a bound)
-      and :class:`~pydvl.value.stopping.MaxTime` as stopping criteria.
+      Implemented in :func:`~pydvl.value.shapley.gt.group_testing_shapley`. This
+      method does not take a :class:`~pydvl.value.stopping.StoppingCriterion`
+      but instead requires a parameter ``n_samples`` for the number of
+      iterations to run.
 
     Additionally, one can use model-specific methods:
 
@@ -126,8 +129,8 @@ def compute_shapley_values(
     elif mode == ShapleyMode.PermutationExact:
         return permutation_exact_shapley(u, progress=progress)
     elif mode == ShapleyMode.Owen or mode == ShapleyMode.OwenAntithetic:
-        if kwargs.get("n_iterations") is None:
-            raise ValueError("n_iterations cannot be None for Owen methods")
+        if kwargs.get("n_samples") is None:
+            raise ValueError("n_samples cannot be None for Owen methods")
         if kwargs.get("max_q") is None:
             raise ValueError("Owen Sampling requires max_q for the outer integral")
 
@@ -138,17 +141,17 @@ def compute_shapley_values(
         )
         return owen_sampling_shapley(
             u,
-            n_iterations=int(kwargs.get("n_iterations", -1)),
+            n_samples=int(kwargs.get("n_samples", -1)),
             max_q=int(kwargs.get("max_q", -1)),
             method=method,
             n_jobs=n_jobs,
         )
     elif mode == ShapleyMode.KNN:
         return knn_shapley(u, progress=progress)
     elif mode == ShapleyMode.GroupTesting:
-        n_iterations = kwargs.pop("n_iterations")
-        if n_iterations is None:
-            raise ValueError("n_iterations cannot be None for Group Testing")
+        n_samples = kwargs.pop("n_samples")
+        if n_samples is None:
+            raise ValueError("n_samples cannot be None for Group Testing")
         epsilon = kwargs.pop("epsilon")
         if epsilon is None:
             raise ValueError("Group Testing requires error bound epsilon")
@@ -157,7 +160,7 @@ def compute_shapley_values(
             u,
             epsilon=epsilon,
             delta=delta,
-            n_iterations=n_iterations,
+            n_samples=n_samples,
             n_jobs=n_jobs,
             progress=progress,
             **kwargs,