Commit 7e1e216

Merge branch 'develop' into refactor/tmcs
2 parents 1114805 + c56450a commit 7e1e216

File tree

10 files changed (+120 / -28 lines)

CHANGELOG.md

Lines changed: 10 additions & 5 deletions
@@ -5,10 +5,15 @@
 - Generalised stopping criteria for valuation algorithms. Improved classes
   `ValuationResult` and `Status` with more operations. Some minor issues fixed.
   [PR #252](https://github.com/appliedAI-Initiative/pyDVL/pull/250)
-- Splitting of problem preparation and solution in Least Core computation.
+- Fixed a bug whereby `compute_shapley_values` would only spawn one process when
+  using `n_jobs=-1` and Monte Carlo methods.
+  [PR #270](https://github.com/appliedAI-Initiative/pyDVL/pull/270)
+- Bugfix in `RayParallelBackend`: wrong semantics for `kwargs`.
+  [PR #268](https://github.com/appliedAI-Initiative/pyDVL/pull/268)
+- Splitting of problem preparation and solution in Least-Core computation.
   Umbrella function for LC methods.
-  [PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
-- Operations on ValuationResults and Statuses and cleanup
+  [PR #257](https://github.com/appliedAI-Initiative/pyDVL/pull/257)
+- Operations on `ValuationResult` and `Status` and some cleanup
   [PR #248](https://github.com/appliedAI-Initiative/pyDVL/pull/248)
 - **Bug fix and minor improvements**: Fixes bug in TMCS with remote Ray cluster,
   raises an error for dummy sequential parallel backend with TMCS, clones model
@@ -28,8 +33,8 @@
 - **Breaking change:** Introduces a class ValuationResult to gather and inspect
   results from all valuation algorithms
   [PR #214](https://github.com/appliedAI-Initiative/pyDVL/pull/214)
-- Fixes bug in Influence calculation with multi-dimensional input and adds
-  new example notebook
+- Fixes bug in Influence calculation with multidimensional input and adds new
+  example notebook
   [PR #195](https://github.com/appliedAI-Initiative/pyDVL/pull/195)
 - **Breaking change**: Passes the input to `MapReduceJob` at initialization,
   removes `chunkify_inputs` argument from `MapReduceJob`, removes `n_runs`

src/pydvl/utils/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -524,7 +524,7 @@ def load_spotify_dataset(
     if file_path.exists():
         data = pd.read_csv(file_path)
     else:
-        url = "https://github.com/appliedAI-Initiative/pyDVL/blob/develop/data/top_hits_spotify_dataset.csv"
+        url = "https://raw.githubusercontent.com/appliedAI-Initiative/pyDVL/develop/data/top_hits_spotify_dataset.csv"
        data = pd.read_csv(url)
        data.to_csv(file_path, index=False)
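
For context, the fix swaps GitHub's HTML "blob" page for the raw file endpoint. A minimal illustration of the difference (not part of the diff; it assumes network access):

import pandas as pd

# The old URL points at the GitHub HTML page wrapping the file, so read_csv
# would try to parse markup instead of the dataset.
blob_url = "https://github.com/appliedAI-Initiative/pyDVL/blob/develop/data/top_hits_spotify_dataset.csv"

# raw.githubusercontent.com serves the plain CSV contents, which is what
# pandas expects.
raw_url = "https://raw.githubusercontent.com/appliedAI-Initiative/pyDVL/develop/data/top_hits_spotify_dataset.csv"

data = pd.read_csv(raw_url)
print(data.head())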

src/pydvl/utils/parallel/actor.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ class RayActorWrapper:
     def __init__(self, actor_class: Type, config: ParallelConfig, *args, **kwargs):
         parallel_backend = cast(RayParallelBackend, init_parallel_backend(config))
         remote_cls = parallel_backend.wrap(actor_class)
-        self.actor_handle = remote_cls.remote(*args, **kwargs)
+        self.actor_handle = remote_cls(*args, **kwargs)

         def remote_caller(method_name: str):
             # Wrapper for remote class' methods to mimic local calls

src/pydvl/utils/parallel/backend.py

Lines changed: 52 additions & 11 deletions
@@ -1,18 +1,26 @@
-import functools
 import os
 from abc import ABCMeta, abstractmethod
 from dataclasses import asdict
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, TypeVar, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)

 import ray
 from ray import ObjectRef
 from ray.remote_function import RemoteFunction

 from ..config import ParallelConfig

-__all__ = [
-    "init_parallel_backend",
-]
+__all__ = ["init_parallel_backend", "effective_n_jobs", "available_cpus"]

 T = TypeVar("T")

@@ -63,7 +71,7 @@ def put(self, v: Any, *args, **kwargs) -> Any:
         ...

     @abstractmethod
-    def wrap(self, *args, **kwargs) -> Any:
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
         ...

     @abstractmethod
@@ -104,9 +112,11 @@ def get(self, v: Any, *args, **kwargs):
     def put(self, v: Any, *args, **kwargs) -> Any:
         return v

-    def wrap(self, *args, **kwargs) -> Any:
-        assert len(args) == 1
-        return functools.partial(args[0], **kwargs)
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
+        """Wraps a function for sequential execution.
+
+        This is a noop and kwargs are ignored."""
+        return fun

     def wait(self, v: Any, *args, **kwargs) -> Tuple[list, list]:
         return v, []
@@ -151,8 +161,17 @@ def put(self, v: T, *args, **kwargs) -> Union["ObjectRef[T]", T]:
         except TypeError:
             return v  # type: ignore

-    def wrap(self, *args, **kwargs) -> RemoteFunction:
-        return ray.remote(*args, **kwargs)  # type: ignore
+    def wrap(self, fun: Callable, **kwargs) -> Callable:
+        """Wraps a function as a ray remote.
+
+        :param fun: the function to wrap
+        :param kwargs: keyword arguments to pass to @ray.remote
+
+        :return: The `.remote` method of the ray `RemoteFunction`.
+        """
+        if len(kwargs) > 1:
+            return ray.remote(**kwargs)(fun).remote  # type: ignore
+        return ray.remote(fun).remote  # type: ignore

     def wait(
         self,
@@ -213,3 +232,25 @@ def available_cpus() -> int:
     if system() != "Linux":
         return os.cpu_count() or 1
     return len(os.sched_getaffinity(0))
+
+
+def effective_n_jobs(n_jobs: int, config: ParallelConfig = ParallelConfig()) -> int:
+    """Returns the effective number of jobs.
+
+    This number may vary depending on the parallel backend and the resources
+    available.
+
+    :param n_jobs: the number of jobs requested. If -1, the number of available
+        CPUs is returned.
+    :param config: instance of :class:`~pydvl.utils.config.ParallelConfig` with
+        cluster address, number of cpus, etc.
+    :return: the effective number of jobs, guaranteed to be >= 1.
+    :raises RuntimeError: if the effective number of jobs returned by the backend
+        is < 1.
+    """
+    parallel_backend = init_parallel_backend(config)
+    if (eff_n_jobs := parallel_backend.effective_n_jobs(n_jobs)) < 1:
+        raise RuntimeError(
+            f"Invalid number of jobs {eff_n_jobs} obtained from parallel backend {config.backend}"
+        )
+    return eff_n_jobs
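
To make the new contract concrete, here is a minimal usage sketch (not part of the diff; it assumes a local Ray installation and uses an illustrative `square` function). `wrap` now always returns something directly callable: for Ray it is the `.remote` method of the remote function, for the sequential backend the function itself. `effective_n_jobs` resolves `n_jobs=-1` against the resources the configured backend reports.

from pydvl.utils.config import ParallelConfig
from pydvl.utils.parallel import init_parallel_backend
from pydvl.utils.parallel.backend import effective_n_jobs

config = ParallelConfig(backend="ray")  # or backend="sequential"
parallel_backend = init_parallel_backend(config)


def square(x: int) -> int:
    return x * x


# The wrapped object is called directly; callers no longer need
# getattr(f, "remote", f).
wrapped = parallel_backend.wrap(square, num_cpus=1)
result = parallel_backend.get(wrapped(parallel_backend.put(3)))
assert result == 9

# -1 resolves to however many CPUs the backend reports; the result is
# guaranteed to be >= 1.
print(effective_n_jobs(-1, config))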

src/pydvl/utils/parallel/map_reduce.py

Lines changed: 11 additions & 2 deletions
@@ -219,10 +219,19 @@ def reduce(self, chunks: List["ObjectRef[R]"]) -> R:
         return result  # type: ignore

     def _wrap_function(self, func: Callable, **kwargs) -> Callable:
-        remote_func = self.parallel_backend.wrap(
+        """Wraps a function with a timeout and remote arguments and puts it on
+        the remote backend.
+
+        :param func: Function to wrap
+        :param kwargs: Additional keyword arguments to pass to the backend
+            wrapper. These are *not* arguments for the wrapped function.
+        :return: Remote function that can be called with the same arguments as
+            the wrapped function. Depending on the backend, this may simply be
+            the function itself.
+        """
+        return self.parallel_backend.wrap(
             _wrap_func_with_remote_args(func, timeout=self.timeout), **kwargs
         )
-        return getattr(remote_func, "remote", remote_func)  # type: ignore

     def _backpressure(
         self, jobs: List[ObjectRef], n_dispatched: int, n_finished: int

src/pydvl/value/least_core/montecarlo.py

Lines changed: 2 additions & 1 deletion
@@ -7,6 +7,7 @@
 from pydvl.utils.config import ParallelConfig
 from pydvl.utils.numeric import random_powerset
 from pydvl.utils.parallel import MapReduceJob
+from pydvl.utils.parallel.backend import effective_n_jobs
 from pydvl.utils.progress import maybe_progress
 from pydvl.utils.utility import Utility
 from pydvl.value.least_core.common import LeastCoreProblem, lc_solve_problem
@@ -136,7 +137,7 @@ def mclc_prepare_problem(
     )
     n_iterations = 2**n

-    iterations_per_job = max(1, n_iterations // n_jobs)
+    iterations_per_job = max(1, n_iterations // effective_n_jobs(n_jobs, config))

     map_reduce_job: MapReduceJob["Utility", "LeastCoreProblem"] = MapReduceJob(
         inputs=u,
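
The reason for going through `effective_n_jobs` here (and in `group_testing_shapley` in `src/pydvl/value/shapley/gt.py` below) is Python's floor division when callers pass `n_jobs=-1` to mean "use all CPUs": dividing by the raw value collapses the chunk size. A small illustration with made-up numbers:

n_iterations, n_jobs = 1024, -1

# Old behaviour: floor division by a negative number yields a negative value,
# and max() clamps the chunk size to 1.
print(max(1, n_iterations // n_jobs))  # 1024 // -1 == -1024, so this prints 1

# New behaviour: -1 is first resolved to the actual worker count (say 8 CPUs),
# giving a sensible number of iterations per job.
print(max(1, n_iterations // 8))       # 128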

src/pydvl/value/shapley/gt.py

Lines changed: 2 additions & 1 deletion
@@ -25,6 +25,7 @@

 from pydvl.utils import MapReduceJob, ParallelConfig, Utility, maybe_progress
 from pydvl.utils.numeric import random_subset_of_size
+from pydvl.utils.parallel.backend import effective_n_jobs
 from pydvl.utils.status import Status
 from pydvl.value import ValuationResult

@@ -237,7 +238,7 @@ def group_testing_shapley(
         f"ε={eps:.02f} guarantee at .95 probability"
     )

-    iterations_per_job = max(1, n_iterations // n_jobs)
+    iterations_per_job = max(1, n_iterations // effective_n_jobs(n_jobs, config))

     def reducer(
         results_it: Iterable[Tuple[NDArray, NDArray]]

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -189,7 +189,7 @@ def seed_numpy(seed=42):
     np.random.seed(seed)


-@pytest.fixture
+@pytest.fixture(scope="session")
 def num_workers():
     return max(1, available_cpus() - 1)

tests/utils/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@


 @pytest.fixture(scope="module", params=["sequential", "ray-local", "ray-external"])
-def parallel_config(request):
+def parallel_config(request, num_workers):
     if request.param == "sequential":
         yield ParallelConfig(backend=request.param)
     elif request.param == "ray-local":
@@ -17,7 +17,7 @@ def parallel_config(request):
         cluster = Cluster(
             initialize_head=True,
             head_node_args={
-                "num_cpus": 4,
+                "num_cpus": num_workers,
             },
         )
         yield ParallelConfig(backend="ray", address=cluster.address)

tests/utils/test_parallel.py

Lines changed: 38 additions & 3 deletions
@@ -1,15 +1,17 @@
 import operator
+import os
+import time
 from functools import partial, reduce

 import numpy as np
 import pytest

 from pydvl.utils.parallel import MapReduceJob, init_parallel_backend
-from pydvl.utils.parallel.backend import available_cpus
+from pydvl.utils.parallel.backend import available_cpus, effective_n_jobs
 from pydvl.utils.parallel.map_reduce import _get_value


-def test_effective_n_jobs(parallel_config):
+def test_effective_n_jobs(parallel_config, num_workers):
     parallel_backend = init_parallel_backend(parallel_config)
     if parallel_config.backend == "sequential":
         assert parallel_backend.effective_n_jobs(1) == 1
@@ -21,7 +23,16 @@ def test_effective_n_jobs(parallel_config):
         if parallel_config.address is None:
             assert parallel_backend.effective_n_jobs(-1) == available_cpus()
         else:
-            assert parallel_backend.effective_n_jobs(-1) == 4
+            assert parallel_backend.effective_n_jobs(-1) == num_workers
+
+    for n_jobs in [-1, 1, 2]:
+        assert parallel_backend.effective_n_jobs(n_jobs) == effective_n_jobs(
+            n_jobs, parallel_config
+        )
+        assert effective_n_jobs(n_jobs, parallel_config) > 0
+
+    with pytest.raises(ValueError):
+        parallel_backend.effective_n_jobs(0)


 @pytest.fixture()
@@ -188,3 +199,27 @@ def test_map_reduce_get_value(x, expected_x, parallel_config):
     parallel_backend = init_parallel_backend(parallel_config)
     x_id = parallel_backend.put(x)
     assert np.all(_get_value(x_id) == expected_x)
+
+
+def test_wrap_function(parallel_config, num_workers):
+    def fun(x, **kwargs):
+        return dict(x=x * x, **kwargs)
+
+    parallel_backend = init_parallel_backend(parallel_config)
+    # Try two kwargs for @ray.remote. Should be ignored in the sequential backend
+    wrapped_func = parallel_backend.wrap(fun, num_cpus=1, max_calls=1)
+    x = parallel_backend.put(2)
+    ret = parallel_backend.get(wrapped_func(x))
+
+    assert ret["x"] == 4
+    assert len(ret) == 1  # Ensure that kwargs are not passed to the function
+
+    if parallel_config.backend != "sequential":
+        # Test that the function is executed in different processes
+        def get_pid():
+            time.sleep(2)  # FIXME: waiting less means fewer processes are used?!
+            return os.getpid()
+
+        wrapped_func = parallel_backend.wrap(get_pid, num_cpus=1)
+        pids = parallel_backend.get([wrapped_func() for _ in range(num_workers)])
+        assert len(set(pids)) == num_workers
