
Commit 4619d24

Improve performance
Change-Id: I5ab455421da440fc36c38611c920fd13bbe6a6c2
1 parent ccdca61 commit 4619d24

File tree

5 files changed (+104, -61 lines)

bluepyparallel/evaluator.py

Lines changed: 22 additions & 10 deletions
@@ -4,6 +4,7 @@
 import traceback
 from functools import partial
 
+import pandas as pd
 from tqdm import tqdm
 
 from bluepyparallel.database import DataBase
@@ -19,7 +20,7 @@ def _try_evaluation(task, evaluation_function, func_args, func_kwargs):
         result = evaluation_function(task_args, *func_args, **func_kwargs)
         exception = None
     except Exception:  # pylint: disable=broad-except
-        result = None
+        result = {}
         exception = "".join(traceback.format_exception(*sys.exc_info()))
         logger.exception("Exception for ID=%s: %s", task_id, exception)
     return task_id, result, exception
@@ -34,6 +35,7 @@ def evaluate(
     db_url=None,
     func_args=None,
     func_kwargs=None,
+    **mapper_kwargs,
 ):
     """Evaluate and save results in a sqlite database on the fly and return dataframe.
 
@@ -54,14 +56,15 @@
            SQL database.
        func_args (list): the arguments to pass to the evaluation_function.
        func_kwargs (dict): the keyword arguments to pass to the evaluation_function.
+       **mapper_kwargs: the keyword arguments are passed to the get_mapper() method of the
+           :class:`ParallelFactory` instance.
 
    Return:
        pandas.DataFrame: dataframe with new columns containing the computed results.
    """
    # Initialize the parallel factory
    if isinstance(parallel_factory, str) or parallel_factory is None:
        parallel_factory = init_parallel_factory(parallel_factory)
-
    # Set default args
    if func_args is None:
        func_args = []
@@ -74,6 +77,10 @@
    to_evaluate = df.copy()
    task_ids = to_evaluate.index
 
+   if "exception" in to_evaluate.columns:
+       logger.warning("The exception column is going to be replaced")
+       to_evaluate = to_evaluate.drop(columns=["exception"])
+
    # Set default new columns
    if new_columns is None:
        new_columns = [["data", ""]]
@@ -120,7 +127,7 @@
        return to_evaluate
 
    # Get the factory mapper
-   mapper = parallel_factory.get_mapper()
+   mapper = parallel_factory.get_mapper(**mapper_kwargs)
 
    # Setup the function to apply to the data
    eval_func = partial(
@@ -134,18 +141,23 @@
    arg_list = list(to_evaluate.loc[task_ids, df.columns].to_dict("index").items())
 
    try:
-       for task_id, results, exception in tqdm(mapper(eval_func, arg_list), total=len(task_ids)):
+       res = []
+
+       # Collect the results
+       for task_id, result, exception in tqdm(mapper(eval_func, arg_list), total=len(task_ids)):
+           res.append(dict({"df_index": task_id, "exception": exception}, **result))
+
            # Save the results into the DB
            if db is not None:
                db.write(
-                   task_id, results, exception, **to_evaluate.loc[task_id, df.columns].to_dict()
+                   task_id, result, exception, **to_evaluate.loc[task_id, df.columns].to_dict()
                )
 
-           # Save the results into the DataFrame
-           if results is not None:
-               to_evaluate.loc[task_id, results.keys()] = list(results.values())
-           elif exception is not None:
-               to_evaluate.loc[task_id, "exception"] = exception
+       # Gather the results to the output DataFrame
+       res_df = pd.DataFrame(res)
+       res_df.set_index("df_index", inplace=True)
+       to_evaluate.loc[res_df.index, res_df.columns] = res_df
+
    except (KeyboardInterrupt, SystemExit) as ex:
        # To save dataframe even if program is killed
        logger.warning("Stopping mapper loop. Reason: %r", ex)

bluepyparallel/parallel.py

Lines changed: 60 additions & 36 deletions
@@ -33,19 +33,47 @@ class ParallelFactory:
     """Abstract class that should be subclassed to provide parallel functions."""
 
     _BATCH_SIZE = "PARALLEL_BATCH_SIZE"
+    _CHUNK_SIZE = "PARALLEL_CHUNK_SIZE"
 
-    def __init__(self, *args, batch_size=None, **kwargs):  # pylint: disable=unused-argument
+    # pylint: disable=unused-argument
+    def __init__(self, batch_size=None, chunk_size=None, **kwargs):
         self.batch_size = batch_size or int(os.getenv(self._BATCH_SIZE, "0")) or None
-        self.nb_processes = 1
         L.info("Using %s=%s", self._BATCH_SIZE, self.batch_size)
 
+        self.chunk_size = batch_size or int(os.getenv(self._CHUNK_SIZE, "0")) or None
+        L.info("Using %s=%s", self._CHUNK_SIZE, self.chunk_size)
+
+        self.nb_processes = 1
+
     @abstractmethod
-    def get_mapper(self):
+    def get_mapper(self, batch_size=None, chunk_size=None, **kwargs):
         """Return a mapper function that can be used to execute functions in parallel."""
 
     def shutdown(self):
         """Can be used to cleanup."""
 
+    def _with_batches(self, mapper, func, iterable, batch_size=None):
+        """Wrapper on mapper function creating batches of iterable to give to mapper.
+
+        The batch_size is an int corresponding to the number of evaluation in each batch/
+        """
+        if isinstance(iterable, Iterator):
+            iterable = list(iterable)
+
+        batch_size = batch_size or self.batch_size
+        if batch_size is not None:
+            iterables = np.array_split(iterable, len(iterable) // min(batch_size, len(iterable)))
+        else:
+            iterables = [iterable]
+
+        for _iterable in iterables:
+            yield from mapper(func, _iterable)
+
+    def _chunksize_to_kwargs(self, chunk_size, kwargs, label="chunk_size"):
+        chunk_size = chunk_size or self.chunk_size
+        if chunk_size is not None:
+            kwargs[label] = chunk_size
+
 
 class NoDaemonProcess(multiprocessing.Process):
     """Class that represents a non-daemon process"""
@@ -72,26 +100,10 @@ class NestedPool(Pool):  # pylint: disable=abstract-method
     Process = NoDaemonProcess
 
 
-def _with_batches(mapper, func, iterable, batch_size=None):
-    """Wrapper on mapper function creating batches of iterable to give to mapper.
-
-    The batch_size is an int corresponding to the number of evaluation in each batch/
-    """
-    if isinstance(iterable, Iterator):
-        iterable = list(iterable)
-    if batch_size is not None:
-        iterables = np.array_split(iterable, len(iterable) // min(batch_size, len(iterable)))
-    else:
-        iterables = [iterable]
-
-    for _iterable in iterables:
-        yield from mapper(func, _iterable)
-
-
 class SerialFactory(ParallelFactory):
     """Factory that do not work in parallel."""
 
-    def get_mapper(self):
+    def get_mapper(self, batch_size=None, chunk_size=None, **kwargs):
         """Get a map."""
         return map
 
@@ -101,18 +113,24 @@ class MultiprocessingFactory(ParallelFactory):
 
     _CHUNKSIZE = "PARALLEL_CHUNKSIZE"
 
-    def __init__(self, *args, processes=None, **kwargs):
+    def __init__(self, processes=None, **kwargs):
         """Initialize multiprocessing factory."""
 
-        super().__init__()
-        self.pool = NestedPool(*args, **kwargs)
+        super().__init__(**kwargs)
+
+        self.pool = NestedPool(processes=processes)
         self.nb_processes = processes or os.cpu_count()
 
-    def get_mapper(self):
+    def get_mapper(self, batch_size=None, chunk_size=None, **kwargs):
         """Get a NestedPool."""
+        self._chunksize_to_kwargs(chunk_size, kwargs)
 
         def _mapper(func, iterable):
-            return _with_batches(self.pool.imap_unordered, func, iterable, self.batch_size)
+            return self._with_batches(
+                partial(self.pool.imap_unordered, **kwargs),
+                func,
+                iterable,
+            )
 
         return _mapper
 
@@ -126,24 +144,29 @@ class IPyParallelFactory(ParallelFactory):
 
     _IPYTHON_PROFILE = "IPYTHON_PROFILE"
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, **kwargs):
         """Initialize the ipyparallel factory."""
 
-        super().__init__()
+        super().__init__(**kwargs)
         self.rc = None
         self.nb_processes = 1
 
-    def get_mapper(self):
+    def get_mapper(self, batch_size=None, chunk_size=None, **kwargs):
         """Get an ipyparallel mapper using the profile name provided."""
-        profile = os.getenv(self._IPYTHON_PROFILE, "DEFAULT_IPYTHON_PROFILE")
+        profile = os.getenv(self._IPYTHON_PROFILE, None)
         L.debug("Using %s=%s", self._IPYTHON_PROFILE, profile)
         self.rc = ipyparallel.Client(profile=profile)
        self.nb_processes = len(self.rc.ids)
        lview = self.rc.load_balanced_view()
 
+       if "ordered" not in kwargs:
+           kwargs["ordered"] = False
+
+       self._chunksize_to_kwargs(chunk_size, kwargs)
+
        def _mapper(func, iterable):
-           return _with_batches(
-               partial(lview.imap, ordered=False), func, iterable, self.batch_size
+           return self._with_batches(
+               partial(lview.imap, **kwargs), func, iterable, batch_size=batch_size
            )
 
        return _mapper
@@ -159,7 +182,7 @@ class DaskFactory(ParallelFactory):
 
    _SCHEDULER_PATH = "PARALLEL_DASK_SCHEDULER_PATH"
 
-   def __init__(self, *args, **kwargs):
+   def __init__(self, **kwargs):
        """Initialize the dask factory."""
        dask_scheduler_path = os.getenv(self._SCHEDULER_PATH)
        if dask_scheduler_path:
@@ -172,7 +195,7 @@ def __init__(self, *args, **kwargs):
            L.info("Starting dask_mpi...")
            self.client = dask.distributed.Client()
        self.nb_processes = len(self.client.scheduler_info()["workers"])
-       super().__init__()
+       super().__init__(**kwargs)
 
    def shutdown(self):
        """Retire the workers on the scheduler."""
@@ -181,16 +204,17 @@ def shutdown(self):
        self.client.retire_workers()
        self.client = None
 
-   def get_mapper(self):
+   def get_mapper(self, batch_size=None, chunk_size=None, **kwargs):
        """Get a Dask mapper."""
+       self._chunksize_to_kwargs(chunk_size, kwargs, label="batch_size")
 
        def _mapper(func, iterable):
            def _dask_mapper(func, iterable):
-               futures = self.client.map(func, iterable)
+               futures = self.client.map(func, iterable, **kwargs)
                for _future, result in dask.distributed.as_completed(futures, with_results=True):
                    yield result
 
-           return _with_batches(_dask_mapper, func, iterable, self.batch_size)
+           return self._with_batches(_dask_mapper, func, iterable, batch_size=batch_size)
 
        return _mapper
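The _with_batches() helper that moved onto ParallelFactory above splits the task list into roughly equal batches before handing each one to the underlying mapper. A standalone sketch of that splitting logic, assuming numpy is available (the batched_map name is illustrative and not part of the library):

import numpy as np


def batched_map(mapper, func, iterable, batch_size=None):
    # Same idea as ParallelFactory._with_batches(): split the items into
    # len(items) // batch_size groups and map over each group in turn.
    items = list(iterable)
    if batch_size is not None:
        groups = np.array_split(items, len(items) // min(batch_size, len(items)))
    else:
        groups = [items]
    for group in groups:
        yield from mapper(func, group)


# With 10 items and batch_size=4, np.array_split creates 2 groups of 5 items.
print([len(group) for group in np.array_split(range(10), 10 // 4)])
print(list(batched_map(map, lambda x: int(x) * 2, range(10), batch_size=4)))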

examples/large_computation.py

Lines changed: 20 additions & 11 deletions
@@ -4,24 +4,33 @@
 import time
 from bluepyparallel import evaluate
 from bluepyparallel import init_parallel_factory
+from data_validation_framework.util import apply_to_df
 
 
 def func(row):
     """Trivial computation"""
-
-    time.sleep(1)
-
-    return {"out": row["data"] + 10}
+    if row["data"] in [1, 3]:
+        raise ValueError(f"The value {row['data']} is forbidden")
+    else:
+        return {"out": row["data"] + 10}
 
 
 if __name__ == "__main__":
-    parallel_lib = sys.argv[1]
-    import bglibpy
-
-    parallel_factory = init_parallel_factory(parallel_lib)
-    print("using ", parallel_lib)
+    parallel_lib = sys.argv[1] or None
+    batch_size = int(sys.argv[2]) if len(sys.argv) >= 3 else None
+    chunk_size = int(sys.argv[3]) if len(sys.argv) >= 4 else None
     df = pd.DataFrame()
     df["data"] = np.arange(1e6)
-    print(df)
-    df = evaluate(df, func, new_columns=[["out", 0]], parallel_factory=parallel_factory)
+
+    parallel_factory = init_parallel_factory(parallel_lib, batch_size=batch_size)
+    df = evaluate(
+        df,
+        func,
+        new_columns=[["out", 0]],
+        parallel_factory=parallel_factory,
+        chunksize=chunk_size,
+    )
     parallel_factory.shutdown()
+    print(df)
+    print(df.loc[1, "exception"])
+    print(df.loc[3, "exception"])
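With the evaluator change above, every row of the returned DataFrame now carries an exception column: None for rows that succeeded and a formatted traceback string for rows that raised (here the forbidden values 1 and 3). An illustrative follow-up, not part of the committed example, showing how the failed rows can be inspected in bulk:

# Rows whose evaluation raised have a non-null traceback in "exception".
failed = df[df["exception"].notnull()]
print(f"{len(failed)} rows failed")
print(failed[["data", "exception"]])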

examples/run_large_dask.sh

Lines changed: 2 additions & 3 deletions
@@ -16,9 +16,8 @@ module load unstable py-dask-mpi
 module load unstable py-bglibpy
 module load unstable neurodamus-neocortex
 
-deactivate
-. venv/bin/activate
+. ~/base/bin/activate
 
 unset PMI_RANK
 
-srun python large_computation.py dask
+srun python large_computation.py dask 100000 1000
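Note: per the argument parsing in the updated examples/large_computation.py, the two new positional arguments are read as batch_size (sys.argv[2]) and chunk_size (sys.argv[3]), so this run uses batches of 100000 tasks and a chunk size of 1000.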

tox.ini

Lines changed: 0 additions & 1 deletion
@@ -29,7 +29,6 @@ deps = bbp-nse-ci
 commands = do_release.py -p . check-version
 
 [testenv:lint]
-basepython=python3.6
 deps =
     {[base]testdeps}
     pycodestyle
