Commit d4a6767
Improve DB inserts for dask_dataframe factory
Change-Id: I7d696a45df6818d14a77dc4ae0d0db0fa844f88b
1 parent 30e3fd0 commit d4a6767

6 files changed: +58 −19 lines changed

CHANGELOG.rst

Lines changed: 11 additions & 1 deletion

@@ -1,10 +1,20 @@
 Changelog
 =========
 
+Version 0.0.6
+-------------
+
+- Improve DB inserts for dask.dataframe factory
+
+Version 0.0.5
+-------------
+
+- Add support for dask.dataframe
+
 Version 0.0.4
 -------------
 
-- Added DaskDataframe factory
+- Update doc, README and author
 
 Version 0.0.3
 -------------

bluepyparallel/database.py

Lines changed: 22 additions & 0 deletions

@@ -4,6 +4,7 @@
 import pandas as pd
 from sqlalchemy import MetaData
 from sqlalchemy import Table
+from sqlalchemy import bindparam
 from sqlalchemy import create_engine
 from sqlalchemy import insert
 from sqlalchemy import schema
@@ -15,6 +16,7 @@
 
 try:  # pragma: no cover
     import psycopg2
+    import psycopg2.extras
 
     with_psycopg2 = True
 except ImportError:
@@ -126,3 +128,23 @@ def write(self, row_id, result=None, exception=None, **input_values):
 
         query = insert(self.table).values(dict(**{self.index_col: row_id}, **vals, **input_values))
         self.connection.execute(query)
+
+    def write_batch(self, columns, data):
+        """Write entries from a list of lists into the table."""
+        if not data:  # pragma: no cover
+            return
+        assert len(columns) + 1 == len(
+            data[0]
+        ), "The columns list must have one less entry than each data element"
+        cursor = self.connection.connection.cursor()
+        cols = {col: bindparam(col) for col in [self.index_col] + columns}
+        # pylint: disable=no-value-for-parameter
+        compiled = self.table.insert().values(**cols).compile(dialect=self.engine.dialect)
+
+        if hasattr(cursor, "mogrify") and with_psycopg2:  # pragma: no cover
+            psycopg2.extras.execute_values(cursor, str(compiled), data)
+        else:
+            cursor.executemany(str(compiled), data)
+
+        self.connection.connection.commit()
+        self.connection.connection.close()
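
For context on what the new write_batch does: it compiles a single INSERT statement once, then sends every row in one call, via psycopg2's execute_values fast path when the cursor supports it, or plain DBAPI executemany otherwise. Below is a minimal sketch of the same idea against sqlite3; the table and column names (demo.db, results, value, exception) are illustrative assumptions, not taken from the commit:

# Batched-insert sketch mirroring write_batch's fallback branch.
# sqlite3 cursors have no "mogrify", so executemany is used; under
# psycopg2, execute_values would push the whole batch in one statement.
import sqlite3

conn = sqlite3.connect("demo.db")
conn.execute("CREATE TABLE IF NOT EXISTS results (id INTEGER, value REAL, exception TEXT)")

columns = ["value", "exception"]
# One leading index value per row, matching write_batch's assert:
# len(columns) + 1 == len(data[0])
data = [(0, 1.5, None), (1, 2.5, None)]

cur = conn.cursor()
cur.executemany("INSERT INTO results (id, value, exception) VALUES (?, ?, ?)", data)
conn.commit()
conn.close()

The win over the previous per-row db.write calls is one round-trip per batch instead of one INSERT per row.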

bluepyparallel/evaluator.py

Lines changed: 20 additions & 16 deletions

@@ -42,7 +42,15 @@ def _try_evaluation_df(task, evaluation_function, func_args, func_kwargs):
 
 
 def _evaluate_dataframe(
-    to_evaluate, df, evaluation_function, func_args, func_kwargs, new_columns, mapper, task_ids, db
+    to_evaluate,
+    input_cols,
+    evaluation_function,
+    func_args,
+    func_kwargs,
+    new_columns,
+    mapper,
+    task_ids,
+    db,
 ):
     """Internal evalution function for dask.dataframe."""
     # Setup the function to apply to the data
@@ -57,25 +65,21 @@ def _evaluate_dataframe(
     res = []
     try:
         # Compute and collect the results
-        for batch in mapper(eval_func, to_evaluate.loc[task_ids, df.columns], meta=meta):
+        for batch in mapper(eval_func, to_evaluate.loc[task_ids, input_cols], meta=meta):
             res.append(batch)
 
             if db is not None:
-                # pylint: disable=cell-var-from-loop
-                batch_complete = to_evaluate[df.columns].join(batch, how="right")
-                batch_cols = [col for col in batch_complete.columns if col != "exception"]
-                batch_complete.apply(
-                    lambda row: db.write(row.name, row[batch_cols].to_dict(), row["exception"]),
-                    axis=1,
-                )
+                batch_complete = to_evaluate[input_cols].join(batch, how="right")
+                data = batch_complete.to_records().tolist()
+                db.write_batch(batch_complete.columns.tolist(), data)
     except (KeyboardInterrupt, SystemExit) as ex:  # pragma: no cover
         # To save dataframe even if program is killed
         logger.warning("Stopping mapper loop. Reason: %r", ex)
     return pd.concat(res)
 
 
 def _evaluate_basic(
-    to_evaluate, df, evaluation_function, func_args, func_kwargs, mapper, task_ids, db
+    to_evaluate, input_cols, evaluation_function, func_args, func_kwargs, mapper, task_ids, db
 ):
 
     res = []
@@ -88,7 +92,7 @@ def _evaluate_basic(
     )
 
     # Split the data into rows
-    arg_list = list(to_evaluate.loc[task_ids, df.columns].to_dict("index").items())
+    arg_list = list(to_evaluate.loc[task_ids, input_cols].to_dict("index").items())
 
     try:
         # Compute and collect the results
@@ -98,7 +102,7 @@ def _evaluate_basic(
             # Save the results into the DB
             if db is not None:
                 db.write(
-                    task_id, result, exception, **to_evaluate.loc[task_id, df.columns].to_dict()
+                    task_id, result, exception, **to_evaluate.loc[task_id, input_cols].to_dict()
                 )
     except (KeyboardInterrupt, SystemExit) as ex:
         # To save dataframe even if program is killed
@@ -132,7 +136,7 @@ def _prepare_db(db_url, to_evaluate, df, resume, task_ids):
     logger.info("Create SQL database")
     db.create(to_evaluate)
 
-    return db, db.get_url()
+    return db, db.get_url(), task_ids
 
 
 def evaluate(
@@ -209,7 +213,7 @@ def evaluate(
         logger.info("Not using SQL backend to save iterations")
         db = None
     else:
-        db, db_url = _prepare_db(db_url, to_evaluate, df, resume, task_ids)
+        db, db_url, task_ids = _prepare_db(db_url, to_evaluate, df, resume, task_ids)
 
     # Log the number of tasks to run
     if len(task_ids) > 0:
@@ -224,7 +228,7 @@ def evaluate(
     if isinstance(parallel_factory, DaskDataFrameFactory):
         res_df = _evaluate_dataframe(
             to_evaluate,
-            df,
+            df.columns,
             evaluation_function,
             func_args,
             func_kwargs,
@@ -236,7 +240,7 @@ def evaluate(
     else:
         res_df = _evaluate_basic(
             to_evaluate,
-            df,
+            df.columns,
            evaluation_function,
            func_args,
            func_kwargs,
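
The payload handed to write_batch is built with to_records(), which prepends the index to every row. A small sketch with a toy DataFrame, using illustrative column names rather than anything from the commit:

# Sketch of the batch payload built in _evaluate_dataframe.
import pandas as pd

batch_complete = pd.DataFrame(
    {"value": [1.5, 2.5], "exception": [None, None]},
    index=pd.Index([0, 1], name="index"),
)

# to_records() includes the index, so each tuple has one more entry
# than there are columns -- the exact shape write_batch asserts on.
data = batch_complete.to_records().tolist()
print(data)                             # [(0, 1.5, None), (1, 2.5, None)]
print(batch_complete.columns.tolist())  # ['value', 'exception']

This replaces the old DataFrame.apply loop, which issued one db.write call per row.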

bluepyparallel/parallel.py

Lines changed: 3 additions & 1 deletion

@@ -90,7 +90,9 @@ def _with_batches(self, mapper, func, iterable, batch_size=None):
         else:
             iterables = [iterable]
 
-        for _iterable in iterables:
+        for i, _iterable in enumerate(iterables):
+            if len(iterables) > 1:
+                L.info("Computing batch %s / %s", i + 1, len(iterables))
             yield from mapper(func, _iterable)
 
     def _chunksize_to_kwargs(self, chunk_size, kwargs, label="chunk_size"):
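
The parallel.py change only adds a progress log per batch. A self-contained sketch of the loop follows; the slicing into batches is an assumption (the diff shows only the else branch and the loop), and the logger setup is a stand-in for the module's own:

# Standalone sketch of the batch loop with the new progress logging.
import logging

logging.basicConfig(level=logging.INFO)
L = logging.getLogger(__name__)

def with_batches(mapper, func, iterable, batch_size=None):
    if batch_size:
        # Assumed splitting; the real method's batching code sits above the hunk.
        iterables = [iterable[i:i + batch_size] for i in range(0, len(iterable), batch_size)]
    else:
        iterables = [iterable]
    for i, _iterable in enumerate(iterables):
        if len(iterables) > 1:
            L.info("Computing batch %s / %s", i + 1, len(iterables))
        yield from mapper(func, _iterable)

print(list(with_batches(map, lambda x: x * 2, list(range(5)), batch_size=2)))
# Logs "Computing batch 1 / 3" etc. and prints [0, 2, 4, 6, 8]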

bluepyparallel/version.py

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
 """Package version"""
 # pragma: no cover
-VERSION = "0.0.5"
+VERSION = "0.0.6.dev0"

setup.py

Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@
 ]
 
 doc_reqs = [
+    "sphinx<4",
     "sphinx-bluebrain-theme",
 ]
