Add DB API using SQLAlchemy

adrien-berchet · adrien-berchet · commit c94dc94d16cc · 2021-03-19T17:53:45.000+01:00
Change-Id: I3a5b0bc8888ffcb4faab01b1f4d80c032a70f54c
diff --git a/bluepyparallel/database.py b/bluepyparallel/database.py
@@ -0,0 +1,144 @@
+"""Database API storing the rows of a DataFrame in a SQL table using SQLAlchemy."""
+import re
+
+import pandas as pd
+from sqlalchemy import MetaData
+from sqlalchemy import Table
+from sqlalchemy import create_engine
+from sqlalchemy import insert
+from sqlalchemy import schema
+from sqlalchemy import select
+from sqlalchemy.engine.reflection import Inspector
+from sqlalchemy_utils import create_database
+from sqlalchemy_utils import database_exists
+
+
+class DataBase:
+    """A database API using SQLAlchemy.
+
+    Args:
+        url (str): a URL understood by SQLAlchemy; a value that does not match
+            ``scheme://...`` is used as the path of a SQLite file.
+        *args: positional arguments passed to ``sqlalchemy.create_engine``.
+        create (bool): if True, create the database when it does not exist yet.
+        **kwargs: keyword arguments passed to ``sqlalchemy.create_engine``.
+    """
+
+    # Name of the SQL column that stores the DataFrame index
+    index_col = "df_index"
+    # A URL that does not match this pattern is handled as a SQLite file path
+    _url_pattern = r"[a-zA-Z0-9_\-\+]+://.*"
+
+    def __init__(self, url, *args, create=False, **kwargs):
+        if not re.match(self._url_pattern, str(url)):
+            url = "sqlite:///" + str(url)
+
+        self.engine = create_engine(url, *args, **kwargs)
+
+        if create and not database_exists(self.engine.url):
+            create_database(self.engine.url)
+
+        self.connection = self.engine.connect()
+        self.metadata = None
+        self.table = None
+
+    def __enter__(self):
+        """Return the instance so that it can be used in a ``with`` statement."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Close the database when leaving the ``with`` statement."""
+        self.close()
+
+    def close(self):
+        """Close the connection opened by the constructor and dispose of the engine."""
+        self.connection.close()
+        self.engine.dispose()
+
+    def get_url(self):
+        """Return the URL of the engine."""
+        return self.engine.url
+
+    def create(self, df, table_name=None, schema_name=None):
+        """Create an empty table with the columns of the given DataFrame and reflect it.
+
+        Args:
+            df (pandas.DataFrame): the DataFrame whose columns define the table.
+            table_name (str): the name of the table (``"df"`` if None).
+            schema_name (str): the name of the schema, created if it does not exist.
+        """
+        if table_name is None:
+            table_name = "df"
+        if schema_name is not None and schema_name not in self.connection.dialect.get_schema_names(
+            self.connection
+        ):
+            self.connection.execute(schema.CreateSchema(schema_name))
+        # Select no rows: only the structure of the DataFrame is written, not its data
+        new_df = df.loc[[]]
+        new_df.to_sql(
+            name=table_name,
+            con=self.connection,
+            schema=schema_name,
+            if_exists="replace",
+            index_label=self.index_col,
+        )
+        self.reflect(table_name, schema_name)
+
+    def exists(self, table_name, schema_name=None):
+        """Return True if the table exists in the given schema."""
+        inspector = Inspector.from_engine(self.engine)
+        return table_name in inspector.get_table_names(schema=schema_name)
+
+    def reflect(self, table_name, schema_name=None):
+        """Load the definition of an existing table into ``self.table``."""
+        self.metadata = MetaData()
+        self.table = Table(
+            table_name,
+            self.metadata,
+            schema=schema_name,
+            autoload=True,
+            autoload_with=self.engine,
+        )
+
+    def load(self):
+        """Return the content of the table as a DataFrame indexed by ``index_col``.
+
+        Raises:
+            RuntimeError: if no table was created or reflected beforehand.
+        """
+        query = select([self._get_table()])
+        return pd.read_sql(query, self.connection, index_col=self.index_col)
+
+    def write(self, row_id, result=None, exception=None, **input_values):
+        """Insert one row into the table.
+
+        Nothing is written when both ``result`` and ``exception`` are None.
+
+        Args:
+            row_id: the value stored in the ``index_col`` column.
+            result (dict): the computed values, given as ``{column name: value}``.
+            exception: the value stored in the ``exception`` column; it is ignored when
+                ``result`` is given.
+            **input_values: the values of the other columns of the row.
+
+        Raises:
+            RuntimeError: if no table was created or reflected beforehand.
+        """
+        if result is not None:
+            vals = result
+        elif exception is not None:
+            vals = {"exception": exception}
+        else:
+            return
+
+        # Dict displays are used instead of dict(**a, **b) so that a column given several
+        # times does not raise a TypeError: the computed values take precedence over the
+        # input values and the row ID is set last so that it can not be overridden.
+        values = {**input_values, **vals, self.index_col: row_id}
+        self.connection.execute(insert(self._get_table()).values(values))
+
+    def _get_table(self):
+        """Return the reflected table, or raise if create() or reflect() was not called."""
+        if self.table is None:
+            raise RuntimeError("No table is loaded: call create() or reflect() first.")
+        return self.table
diff --git a/bluepyparallel/evaluator.py b/bluepyparallel/evaluator.py
@@ -1,20 +1,18 @@
 """Module to evaluate generic functions on rows of dataframe."""
 import logging
-import sqlite3
 import sys
 import traceback
 from functools import partial
-from pathlib import Path
 
-import pandas as pd
 from tqdm import tqdm
 
+from bluepyparallel.database import DataBase
 from bluepyparallel.parallel import init_parallel_factory
 
 logger = logging.getLogger(__name__)
 
 
-def _try_evaluation(task, evaluation_function, db_filename, func_args, func_kwargs):
+def _try_evaluation(task, evaluation_function, func_args, func_kwargs):
     """Encapsulate the evaluation function into a try/except and isolate to record exceptions."""
     task_id, task_args = task
     try:
@@ -24,47 +22,16 @@ def _try_evaluation(task, evaluation_function, db_filename, func_args, func_kwar
         result = None
         exception = "".join(traceback.format_exception(*sys.exc_info()))
         logger.exception("Exception for ID=%s: %s", task_id, exception)
-
-    # Save the results into the DB
-    if db_filename is not None:
-        _write_to_sql(db_filename, task_id, result, exception)
     return task_id, result, exception
 
 
-def _create_database(df, db_filename="db.sql"):
-    """Create a sqlite database from dataframe."""
-    with sqlite3.connect(str(db_filename)) as db:
-        df.to_sql("df", db, if_exists="replace", index_label="df_index")
-
-
-def _load_database_to_dataframe(db_filename="db.sql"):
-    """Load an SQL database and construct the dataframe."""
-    with sqlite3.connect(str(db_filename)) as db:
-        return pd.read_sql("SELECT * FROM df", db, index_col="df_index")
-
-
-def _write_to_sql(db_filename, task_id, results, exception):
-    """Write row data to SQL."""
-    with sqlite3.connect(str(db_filename)) as db:
-        if results is not None:
-            keys, vals = zip(*results.items())
-            query_keys = ", ".join([f"{k}=?" for k in keys])
-        else:
-            query_keys = "exception=?"
-            vals = [exception]
-        db.execute(
-            "UPDATE df SET " + query_keys + " WHERE df_index=?",
-            list(vals) + [task_id],
-        )
-
-
 def evaluate(
     df,
     evaluation_function,
     new_columns=None,
     resume=False,
     parallel_factory=None,
-    db_filename=None,
+    db_url=None,
     func_args=None,
     func_kwargs=None,
 ):
@@ -80,9 +47,11 @@ def evaluate(
         resume (bool): if True, it will use only compute the empty rows of the database,
             if False, it will ecrase or generate the database.
         parallel_factory (ParallelFactory): parallel factory instance.
-        db_filename (str): if a file path is given, SQL backend will be enabled and will use this
-            path for the SQLite database. Should not be used when evaluations are numerous and
-            fast, in order to avoid the overhead of communication with SQL database.
+        db_url (str): a DB URL that can be interpreted by SQLAlchemy, or a file path that is
+            interpreted as a SQLite database. If a URL is given, the SQL backend is enabled to
+            store the results and to allow resuming later. Should not be used when
+            evaluations are numerous and fast, in order to avoid the overhead of communication with
+            SQL database.
         func_args (list): the arguments to pass to the evaluation_function.
         func_kwargs (dict): the keyword arguments to pass to the evaluation_function.
 
@@ -115,12 +84,16 @@ def evaluate(
         to_evaluate[new_column[0]] = new_column[1]
 
     # Create the database if required and get the task ids to run
-    if db_filename is None:
+    if db_url is None:
         logger.info("Not using SQL backend to save iterations")
-    elif resume:
-        logger.info("Load data from SQL database")
-        if Path(db_filename).exists():
-            previous_results = _load_database_to_dataframe(db_filename=db_filename)
+        db = None
+    else:
+        db = DataBase(db_url)
+
+        if resume and db.exists("df"):
+            logger.info("Load data from SQL database")
+            db.reflect("df")
+            previous_results = db.load()
             previous_idx = previous_results.index
             bad_cols = [
                 col
@@ -134,10 +107,10 @@ def evaluate(
             to_evaluate.loc[previous_results.index] = previous_results.loc[previous_results.index]
             task_ids = task_ids.difference(previous_results.index)
         else:
-            _create_database(to_evaluate, db_filename=db_filename)
-    else:
-        logger.info("Create SQL database")
-        _create_database(to_evaluate, db_filename=db_filename)
+            logger.info("Create SQL database")
+            db.create(to_evaluate)
+
+        db_url = db.get_url()
 
     # Log the number of tasks to run
     if len(task_ids) > 0:
@@ -153,16 +126,21 @@ def evaluate(
     eval_func = partial(
         _try_evaluation,
         evaluation_function=evaluation_function,
-        db_filename=db_filename,
         func_args=func_args,
         func_kwargs=func_kwargs,
     )
 
     # Split the data into rows
-    arg_list = list(to_evaluate.loc[task_ids].to_dict("index").items())
+    arg_list = list(to_evaluate.loc[task_ids, df.columns].to_dict("index").items())
 
     try:
         for task_id, results, exception in tqdm(mapper(eval_func, arg_list), total=len(task_ids)):
+            # Save the results into the DB
+            if db is not None:
+                db.write(
+                    task_id, results, exception, **to_evaluate.loc[task_id, df.columns].to_dict()
+                )
+
             # Save the results into the DataFrame
             if results is not None:
                 to_evaluate.loc[task_id, results.keys()] = list(results.values())
diff --git a/setup.py b/setup.py
@@ -12,6 +12,16 @@
 with open("README.rst", encoding="utf-8") as f:
     README = f.read()
 
+reqs = [
+    "pandas",
+    "ipyparallel",
+    "dask[distributed]>=2.30",
+    "dask-mpi>=2.20",
+    "sqlalchemy<1.4",
+    "sqlalchemy-utils",
+    "tqdm",
+]
+
 doc_reqs = [
     "sphinx-bluebrain-theme",
 ]
@@ -32,13 +42,7 @@
         "Source": "ssh://bbpcode.epfl.ch/cells/BluePyParallel",
     },
     license="BBP-internal-confidential",
-    install_requires=[
-        "pandas",
-        "ipyparallel",
-        "dask[distributed]>=2.30",
-        "dask-mpi>=2.20",
-        "tqdm",
-    ],
+    install_requires=reqs,
     extras_require={
         "docs": doc_reqs,
     },
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -3,5 +3,5 @@
 
 
 @pytest.fixture
-def db_filename(tmpdir):
+def db_url(tmpdir):
     return tmpdir / "db.sql"
diff --git a/tests/test_database.py b/tests/test_database.py
@@ -0,0 +1,115 @@
+"""Test the bluepyparallel.database module"""
+# pylint: disable=redefined-outer-name
+import pandas as pd
+import pytest
+from sqlalchemy import MetaData
+from sqlalchemy import Table
+from sqlalchemy import create_engine
+from sqlalchemy import select
+
+from bluepyparallel import database
+
+# "/tmpdir" is a placeholder replaced by the temporary directory in the url fixture
+URLS = ["/tmpdir/test.db", "sqlite:////tmpdir/test.db"]
+
+
+@pytest.fixture(params=URLS)
+def url(request, tmpdir):
+    """Return a DB location in the temporary directory (file path or SQLite URL)."""
+    return request.param.replace("/tmpdir", str(tmpdir))
+
+
+@pytest.fixture
+def small_df():
+    """Return a DataFrame with 6 rows, string indices and an empty exception column."""
+    data = {"a": list(range(6)), "b": [str(i * 10) for i in range(6)], "exception": [None] * 6}
+    idx = [f"idx_{(i + 1) * 2}" for i in range(6)]
+    return pd.DataFrame(data, index=idx)
+
+
+@pytest.fixture()
+def small_db(url, small_df):
+    """Yield a DataBase whose table contains the rows of small_df."""
+    db = database.DataBase(url)
+    db.create(small_df)
+    # create() only writes the structure of the table, so the rows are added here
+    small_df.to_sql(
+        name=db.table.name,
+        con=db.connection,
+        schema=db.table.schema,
+        if_exists="replace",
+        index_label=db.index_col,
+    )
+    yield db
+
+    # Release the connection and the engine once the test is finished
+    db.connection.close()
+    db.engine.dispose()
+
+
+class TestDataBase:
+    """Test the DataBase class."""
+
+    @pytest.mark.parametrize("table_name", [None, "df", "df_name"])
+    @pytest.mark.parametrize("schema_name", [None])
+    def test_create(self, url, small_df, table_name, schema_name):
+        """Check that create() builds an empty table that can be reflected."""
+        db = database.DataBase(url)
+        db.create(small_df, table_name, schema_name)
+
+        # Check DB
+        if url.startswith("/"):
+            url = "sqlite:///" + url
+        engine = create_engine(url)
+        metadata = MetaData()
+        table = Table(
+            table_name or "df",
+            metadata,
+            schema=schema_name,
+            autoload=True,
+            autoload_with=engine,
+        )
+
+        # Check reflected table
+        assert str(table.c.items()) == str(db.table.c.items())
+
+        # Check elements inserted into the DB
+        query = select([table])
+        with engine.connect() as conn:
+            res = conn.execute(query).fetchall()
+        assert res == []
+
+        # Release the connections opened by the test
+        engine.dispose()
+        db.connection.close()
+        db.engine.dispose()
+
+    def test_exists(self, small_db):
+        """Check that exists() only reports the tables present in the DB."""
+        assert small_db.exists("df")
+        assert not small_db.exists("UNKNOWN TABLE")
+
+    def test_load(self, small_df, small_db):
+        """Check that load() returns the rows stored in the DB."""
+        res = small_db.load()
+
+        # Check DB
+        assert res.equals(small_df)
+
+    def test_write(self, small_df, small_db):
+        """Check that write() adds rows for results and exceptions only."""
+        small_db.write("idx_100", result={"a": 1, "b": "test_1"})
+        small_db.write("idx_101", exception="test exception")
+        small_db.write("idx_102")  # Should write nothing
+
+        # Check DB after write
+        res = small_db.load()
+        small_df.loc["idx_100", ["a", "b", "exception"]] = [1, "test_1", None]
+        small_df.loc["idx_101", ["a", "b", "exception"]] = [None, None, "test exception"]
+        assert res.equals(small_df)
+
+    def test_get_url(self, url, small_db):
+        """Check that get_url() returns the SQLite URL for both kinds of input."""
+        if url.startswith("/"):
+            url = "sqlite:///" + url
+        assert str(small_db.get_url()) == url
diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py