Commit 2a3e1f0

adrien-berchet authored and arnaudon committed

Reduce SQL I/Os and can now pass args to the evaluation function

Change-Id: I825dd4d0920c8fcd0fef123420991d57ad49e899

1 parent 8762ada, commit 2a3e1f0

File tree: 7 files changed (+349, -116 lines)

bluepyparallel/evaluator.py

Lines changed: 84 additions, 61 deletions

@@ -3,7 +3,6 @@
 import sqlite3
 import sys
 import traceback
-from collections import defaultdict
 from functools import partial
 from pathlib import Path
 
@@ -15,47 +14,48 @@
 logger = logging.getLogger(__name__)
 
 
-def _try_evaluation(task, evaluation_function=None):
+def _try_evaluation(task, evaluation_function, db_filename, func_args, func_kwargs):
     """Encapsulate the evaluation function into a try/except and isolate to record exceptions."""
     task_id, task_args = task
     try:
-        result = evaluation_function(task_args)
-        exception = ""
+        result = evaluation_function(task_args, *func_args, **func_kwargs)
+        exception = None
     except Exception:  # pylint: disable=broad-except
         result = None
         exception = "".join(traceback.format_exception(*sys.exc_info()))
         logger.exception("Exception for ID=%s: %s", task_id, exception)
+
+    # Save the results into the DB
+    if db_filename is not None:
+        _write_to_sql(db_filename, task_id, result, exception)
     return task_id, result, exception
 
 
-def _create_database(df, new_columns, db_filename="db.sql"):
+def _create_database(df, db_filename="db.sql"):
     """Create a sqlite database from dataframe."""
-    df["exception"] = None
-    for new_column in new_columns:
-        df[new_column[0]] = new_column[1]
-        df["to_run_" + new_column[0]] = 1
     with sqlite3.connect(str(db_filename)) as db:
-        df.to_sql("df", db, if_exists="replace", index_label="index")
-    return df
+        df.to_sql("df", db, if_exists="replace", index_label="df_index")
 
 
 def _load_database_to_dataframe(db_filename="db.sql"):
     """Load an SQL database and construct the dataframe."""
     with sqlite3.connect(str(db_filename)) as db:
-        out = pd.read_sql("SELECT * FROM df", db, index_col="index")
-        return out
+        return pd.read_sql("SELECT * FROM df", db, index_col="df_index")
 
 
-def _write_to_sql(db_filename, task_id, results, new_columns, exception):
+def _write_to_sql(db_filename, task_id, results, exception):
     """Write row data to SQL."""
     with sqlite3.connect(str(db_filename)) as db:
-        for new_column in new_columns:
-            res = results[new_column[0]] if results is not None else None
-            db.execute(
-                "UPDATE df SET " + new_column[0] + "=?, "
-                "exception=?, to_run_" + new_column[0] + "=? WHERE `index`=?",
-                (res, exception, 0, task_id),
-            )
+        if results is not None:
+            keys, vals = zip(*results.items())
+            query_keys = ", ".join([f"{k}=?" for k in keys])
+        else:
+            query_keys = "exception=?"
+            vals = [exception]
+        db.execute(
+            "UPDATE df SET " + query_keys + " WHERE df_index=?",
+            list(vals) + [task_id],
+        )
 
 
 def evaluate(
@@ -65,88 +65,111 @@ def evaluate(
     resume=False,
     parallel_factory=None,
     db_filename=None,
+    func_args=None,
+    func_kwargs=None,
 ):
     """Evaluate and save results in a sqlite database on the fly and return dataframe.
 
     Args:
-        df (DataFrame): each row contains information for the computation
+        df (DataFrame): each row contains information for the computation.
         evaluation_function (function): function used to evaluate each row,
             should have a single argument as list-like containing values of the rows of df,
-            and return a dict with keys corresponding to the names in new_columns
+            and return a dict with keys corresponding to the names in new_columns.
         new_columns (list): list of names of new column and empty value to save evaluation results,
-            i.e.: [['result', 0.0], ['valid', False]]
+            i.e.: [['result', 0.0], ['valid', False]].
         resume (bool): if True, it will use only compute the empty rows of the database,
-            if False, it will ecrase or generate the database
-        parallel_factory (ParallelFactory): parallel factory instance
+            if False, it will ecrase or generate the database.
+        parallel_factory (ParallelFactory): parallel factory instance.
         db_filename (str): if a file path is given, SQL backend will be enabled and will use this
            path for the SQLite database. Should not be used when evaluations are numerous and
           fast, in order to avoid the overhead of communication with SQL database.
+        func_args (list): the arguments to pass to the evaluation_function.
+        func_kwargs (dict): the keyword arguments to pass to the evaluation_function.
 
     Return:
-        pandas.DataFrame: dataframe with new columns containing computed results
+        pandas.DataFrame: dataframe with new columns containing the computed results.
     """
+    # Initialize the parallel factory
     if isinstance(parallel_factory, str) or parallel_factory is None:
         parallel_factory = init_parallel_factory(parallel_factory)
 
-    task_ids = df.index
+    # Set default args
+    if func_args is None:
+        func_args = []
+
+    # Set default kwargs
+    if func_kwargs is None:
+        func_kwargs = {}
 
+    # Shallow copy the given DataFrame to add internal rows
+    to_evaluate = df.copy()
+    task_ids = to_evaluate.index
+
+    # Set default new columns
     if new_columns is None:
         new_columns = [["data", ""]]
 
+    # Setup internal and new columns
+    to_evaluate["exception"] = None
+    for new_column in new_columns:
+        to_evaluate[new_column[0]] = new_column[1]
+
+    # Create the database if required and get the task ids to run
     if db_filename is None:
         logger.info("Not using SQL backend to save iterations")
-        to_evaluate = df
     elif resume:
         logger.info("Load data from SQL database")
         if Path(db_filename).exists():
-            to_evaluate = _load_database_to_dataframe(db_filename=db_filename)
-            task_ids = task_ids.intersection(to_evaluate.index)
+            previous_results = _load_database_to_dataframe(db_filename=db_filename)
+            previous_idx = previous_results.index
+            bad_cols = [
+                col
+                for col in df.columns
+                if not to_evaluate.loc[previous_idx, col].equals(previous_results[col])
+            ]
+            if bad_cols:
+                raise ValueError(
+                    f"The following columns have different values from the DataBase: {bad_cols}"
+                )
+            to_evaluate.loc[previous_results.index] = previous_results.loc[previous_results.index]
+            task_ids = task_ids.difference(previous_results.index)
         else:
-            to_evaluate = _create_database(df, new_columns, db_filename=db_filename)
-
-        # Find tasks to run
-        should_run = (
-            to_evaluate.loc[task_ids, ["to_run_" + col[0] for col in new_columns]] == 1
-        ).any(axis=1)
-        task_ids = should_run.loc[should_run].index
+            _create_database(to_evaluate, db_filename=db_filename)
     else:
         logger.info("Create SQL database")
-        to_evaluate = _create_database(df, new_columns, db_filename=db_filename)
+        _create_database(to_evaluate, db_filename=db_filename)
 
+    # Log the number of tasks to run
     if len(task_ids) > 0:
         logger.info("%s rows to compute.", str(len(task_ids)))
     else:
         logger.warning("WARNING: No row to compute, something may be wrong")
-        return _load_database_to_dataframe(db_filename)
+        return to_evaluate
 
+    # Get the factory mapper
     mapper = parallel_factory.get_mapper()
 
-    eval_func = partial(_try_evaluation, evaluation_function=evaluation_function)
-    arg_list = to_evaluate.to_dict("index").items()
+    # Setup the function to apply to the data
+    eval_func = partial(
+        _try_evaluation,
+        evaluation_function=evaluation_function,
+        db_filename=db_filename,
+        func_args=func_args,
+        func_kwargs=func_kwargs,
+    )
 
-    if db_filename is None:
-        _results = defaultdict(dict)
+    # Split the data into rows
+    arg_list = list(to_evaluate.loc[task_ids].to_dict("index").items())
 
     try:
        for task_id, results, exception in tqdm(mapper(eval_func, arg_list), total=len(task_ids)):
-            if db_filename is None:
-                for new_column, _ in new_columns:
-                    _results[new_column][task_id] = (
-                        results[new_column] if results is not None else None
-                    )
-            else:
-                _write_to_sql(
-                    db_filename,
-                    task_id,
-                    results,
-                    new_columns,
-                    exception,
-                )
+            # Save the results into the DataFrame
+            if results is not None:
+                to_evaluate.loc[task_id, results.keys()] = list(results.values())
+            elif exception is not None:
+                to_evaluate.loc[task_id, "exception"] = exception
     except (KeyboardInterrupt, SystemExit) as ex:
         # To save dataframe even if program is killed
         logger.warning("Stopping mapper loop. Reason: %r", ex)
 
-    if db_filename is None:
-        to_evaluate = pd.concat([to_evaluate, pd.DataFrame(_results)], axis=1)
-        return to_evaluate
-    return _load_database_to_dataframe(db_filename)
+    return to_evaluate
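
The main API change above is that evaluate now accepts func_args and func_kwargs, forwarded to the evaluation function as evaluation_function(row, *func_args, **func_kwargs), and that each finished row is written to the SQLite database inside _try_evaluation instead of in the main loop. A minimal sketch of a call against the new signature; the scaled_sum function, the toy data and the toy_results.sql path are illustrative, not part of the library, and the default factory is assumed to handle parallel_factory=None as the code above suggests:

import pandas as pd

from bluepyparallel import evaluate


def scaled_sum(row, scale, offset=0.0):
    """Toy evaluation function combining the row with the extra arguments."""
    return {"result": scale * row["data"] + offset}


df = pd.DataFrame({"data": [1.0, 2.0, 3.0]})

result_df = evaluate(
    df,
    scaled_sum,
    new_columns=[["result", 0.0]],  # name and default value of the output column
    func_args=[10.0],               # forwarded as scaled_sum(row, 10.0, ...)
    func_kwargs={"offset": 1.0},    # forwarded as scaled_sum(row, ..., offset=1.0)
    db_filename="toy_results.sql",  # optional: also checkpoints each finished row in SQLite
)
print(result_df["result"].tolist())  # expected: [11.0, 21.0, 31.0]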

bluepyparallel/parallel.py

Lines changed: 16 additions, 10 deletions

@@ -6,6 +6,7 @@
 from abc import abstractmethod
 from collections.abc import Iterator
 from functools import partial
+from multiprocessing.pool import Pool
 
 import numpy as np
 
@@ -33,8 +34,9 @@ class ParallelFactory:
 
     _BATCH_SIZE = "PARALLEL_BATCH_SIZE"
 
-    def __init__(self):
-        self.batch_size = int(os.getenv(self._BATCH_SIZE, "0")) or None
+    def __init__(self, *args, batch_size=None, **kwargs):  # pylint: disable=unused-argument
+        self.batch_size = batch_size or int(os.getenv(self._BATCH_SIZE, "0")) or None
+        self.nb_processes = 1
         L.info("Using %s=%s", self._BATCH_SIZE, self.batch_size)
 
     @abstractmethod
@@ -64,7 +66,7 @@ def _set_daemon(self, value):
     daemon = property(_get_daemon, _set_daemon)
 
 
-class NestedPool(multiprocessing.pool.Pool):  # pylint: disable=abstract-method
+class NestedPool(Pool):  # pylint: disable=abstract-method
     """Class that represents a MultiProcessing nested pool"""
 
     Process = NoDaemonProcess
@@ -78,7 +80,7 @@ def _with_batches(mapper, func, iterable, batch_size=None):
     if isinstance(iterable, Iterator):
         iterable = list(iterable)
     if batch_size is not None:
-        iterables = np.array_split(iterable, len(iterable) // batch_size)
+        iterables = np.array_split(iterable, len(iterable) // min(batch_size, len(iterable)))
     else:
         iterables = [iterable]
 
@@ -99,11 +101,12 @@ class MultiprocessingFactory(ParallelFactory):
 
     _CHUNKSIZE = "PARALLEL_CHUNKSIZE"
 
-    def __init__(self):
+    def __init__(self, *args, processes=None, **kwargs):
         """Initialize multiprocessing factory."""
 
         super().__init__()
-        self.pool = NestedPool()
+        self.pool = NestedPool(*args, **kwargs)
+        self.nb_processes = processes or os.cpu_count()
 
     def get_mapper(self):
         """Get a NestedPool."""
@@ -123,17 +126,19 @@ class IPyParallelFactory(ParallelFactory):
 
     _IPYTHON_PROFILE = "IPYTHON_PROFILE"
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         """Initialize the ipyparallel factory."""
 
         super().__init__()
         self.rc = None
+        self.nb_processes = 1
 
     def get_mapper(self):
         """Get an ipyparallel mapper using the profile name provided."""
         profile = os.getenv(self._IPYTHON_PROFILE, "DEFAULT_IPYTHON_PROFILE")
         L.debug("Using %s=%s", self._IPYTHON_PROFILE, profile)
         self.rc = ipyparallel.Client(profile=profile)
+        self.nb_processes = len(self.rc.ids)
         lview = self.rc.load_balanced_view()
 
         def _mapper(func, iterable):
@@ -154,7 +159,7 @@ class DaskFactory(ParallelFactory):
 
     _SCHEDULER_PATH = "PARALLEL_DASK_SCHEDULER_PATH"
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         """Initialize the dask factory."""
         dask_scheduler_path = os.getenv(self._SCHEDULER_PATH)
         if dask_scheduler_path:
@@ -166,6 +171,7 @@ def __init__(self):
             dask_mpi.initialize()
             L.info("Starting dask_mpi...")
             self.client = dask.distributed.Client()
+        self.nb_processes = len(self.client.scheduler_info()["workers"])
         super().__init__()
 
     def shutdown(self):
@@ -189,7 +195,7 @@ def _dask_mapper(func, iterable):
         return _mapper
 
 
-def init_parallel_factory(parallel_lib):
+def init_parallel_factory(parallel_lib, *args, **kwargs):
     """Return the desired instance of the parallel factory.
 
     The main factories are:
@@ -209,7 +215,7 @@ def init_parallel_factory(parallel_lib):
        parallel_factories["ipyparallel"] = IPyParallelFactory
 
     try:
-        parallel_factory = parallel_factories[parallel_lib]()
+        parallel_factory = parallel_factories[parallel_lib](*args, **kwargs)
     except KeyError:
        L.critical(
            "The %s factory is not available, maybe the required libraries are not properly "

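Because every factory constructor now takes *args/**kwargs and init_parallel_factory forwards them, backend options can be set when the factory is created, and each factory exposes an nb_processes attribute. A small sketch, assuming the multiprocessing backend is registered under the "multiprocessing" key; the worker count used here is purely illustrative:

from bluepyparallel import init_parallel_factory

if __name__ == "__main__":
    # `processes` is consumed by MultiprocessingFactory and stored in `nb_processes`;
    # remaining positional/keyword arguments are forwarded to the underlying NestedPool.
    factory = init_parallel_factory("multiprocessing", processes=4)
    print(factory.nb_processes)  # 4
    print(factory.batch_size)    # None unless PARALLEL_BATCH_SIZE is set

    # The mapper has the same (func, iterable) interface that evaluate() relies on.
    mapper = factory.get_mapper()
    print(list(mapper(abs, [-1, -2, 3])))  # [1, 2, 3]

    factory.shutdown()
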
examples/large_computation.py

Lines changed: 27 additions, 0 deletions

@@ -0,0 +1,27 @@
+import pandas as pd
+import sys
+import numpy as np
+import time
+from bluepyparallel import evaluate
+from bluepyparallel import init_parallel_factory
+
+
+def func(row):
+    """Trivial computation"""
+
+    time.sleep(1)
+
+    return {"out": row["data"] + 10}
+
+
+if __name__ == "__main__":
+    parallel_lib = sys.argv[1]
+    import bglibpy
+
+    parallel_factory = init_parallel_factory(parallel_lib)
+    print("using ", parallel_lib)
+    df = pd.DataFrame()
+    df["data"] = np.arange(1e6)
+    print(df)
+    df = evaluate(df, func, new_columns=[["out", 0]], parallel_factory=parallel_factory)
+    parallel_factory.shutdown()
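
The example above runs a slow per-row computation over a million rows with the backend chosen on the command line. Long runs like this are where the SQLite backend pays off: the resume path added in this commit loads the existing database, checks that the input columns still match (raising a ValueError otherwise), and only schedules rows whose index is not already stored. A hedged sketch of a drop-in variant of the evaluate call inside the main block above; the large_computation.sql filename is illustrative:

    # Same computation, but checkpointed in SQLite: rows whose index already exists in
    # the database are reused, and only indices missing from it are evaluated.
    df = evaluate(
        df,
        func,
        new_columns=[["out", 0]],
        parallel_factory=parallel_factory,
        db_filename="large_computation.sql",
        resume=True,
    )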

examples/run_large_dask.sh

Lines changed: 24 additions, 0 deletions

@@ -0,0 +1,24 @@
+#!/bin/bash -l
+#SBATCH --nodes=1           # Number of nodes
+#SBATCH --time=00:10:00     # Time limit
+#SBATCH --partition=prod
+#SBATCH --constraint=cpu
+#SBATCH --mem=0
+#SBATCH --cpus-per-task=1
+#SBATCH --account=proj82    # your project number
+#SBATCH --job-name=test_bpp
+set -e
+
+
+module purge
+module load unstable py-mpi4py
+module load unstable py-dask-mpi
+module load unstable py-bglibpy
+module load unstable neurodamus-neocortex
+
+deactivate
+. venv/bin/activate
+
+unset PMI_RANK
+
+srun python large_computation.py dask
