Commit c082a75

Merge pull request #4 from cid-harvard/feature/hdf-to-postgres-refactor
Feature/hdf to postgres refactor
2 parents 74edfb7 + 48d735f commit c082a75

2 files changed: +64 / -57 lines

pandas_to_postgres/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 from .copy_hdf import HDFTableCopy, SmallHDFTableCopy, BigHDFTableCopy
 from .hdf_to_postgres import (
     hdf_to_postgres,
-    multiprocess_hdf_to_postgres,
     create_hdf_table_objects,
 )
 from .utilities import (
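With multiprocess_hdf_to_postgres removed from the package namespace, both the single-process and multiprocess paths are reached through hdf_to_postgres, via its new processes argument shown in the hdf_to_postgres.py diff below. A minimal sketch of the package-level import after this change, assuming nothing else about the installed package:

from pandas_to_postgres import hdf_to_postgres, create_hdf_table_objects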

pandas_to_postgres/hdf_to_postgres.py

Lines changed: 64 additions & 56 deletions
@@ -1,5 +1,9 @@
 from multiprocessing import Pool
-from .copy_hdf import HDFTableCopy, HDFMetadata
+
+from sqlalchemy import MetaData, create_engine
+
+from .copy_hdf import HDFTableCopy
+from .utilities import HDFMetadata
 
 
 def create_hdf_table_objects(hdf_meta, csv_chunksize=10 ** 6):
@@ -34,91 +38,95 @@ def create_hdf_table_objects(hdf_meta, csv_chunksize=10 ** 6):
     return tables
 
 
-def _copy_worker(copy_obj, defer_sql_objs=True):
-    """
-    Handle a SQLAlchemy connection and copy using HDFTableCopy object
+def _copy_worker(copy_obj, engine_args, engine_kwargs, maintenance_work_mem="1G"):
+
+    # Since we fork()ed into a new process, the engine contains process
+    # specific stuff that shouldn't be shared - this creates a fresh Engine
+    # with the same settings but without those.
+
+    engine = create_engine(*engine_args, **engine_kwargs)
+    metadata = MetaData(bind=engine)
+    metadata.reflect()
+
+    with engine.connect() as conn:
 
-    copy_obj: HDFTableCopy or subclass
-        Object to use to run the copy() method on
-    defer_sql_objs: bool
-        If True, SQL objects were not build upon instantiation of copy_obj and should
-        be built before copying data to db (needed for multiprocessing)
-    """
-    database.engine.dispose()
-    with database.engine.connect() as conn:
         conn.execution_options(autocommit=True)
-        conn.execute("SET maintenance_work_mem TO 1000000;")
 
-        if defer_sql_objs:
-            table_obj = database.metadata.tables[copy_obj.sql_table]
-            copy_obj.instantiate_sql_objs(conn, table_obj)
+        if maintenance_work_mem is not None:
+            conn.execute("SET maintenance_work_mem TO {};".format(maintenance_work_mem))
+
+        # Get SQLAlchemy Table object
+        table_obj = metadata.tables.get(copy_obj.sql_table, None)
+        if table_obj is None:
+            raise ValueError("Table {} does not exist.".format(copy_obj.sql_table))
+
+        copy_obj.instantiate_sql_objs(conn, table_obj)
 
+        # Run the task
         copy_obj.copy()
 
 
-def hdf_to_postgres(file_name, db, keys=[], csv_chunksize=10 ** 6):
+def hdf_to_postgres(file_name, engine_args, engine_kwargs={}, keys=[],
+                    csv_chunksize=10 ** 6, processes=None,
+                    maintenance_work_mem=None):
     """
     Copy tables in a HDF file to PostgreSQL database
 
     Parameters
     ----------
     file_name: str
         name of file or path to file of HDF to use to copy
-    db: SQLAlchemy database object
-        destination database
+    engine_args: list
+        arguments to pass into create_engine()
+    engine_kwargs: dict
+        keyword arguments to pass into create_engine()
     keys: list of strings
         HDF keys to copy
     csv_chunksize: int
         Maximum number of StringIO CSV rows to keep in memory at a time
+    processes: int or None
+        If None, run single threaded. If integer, number of processes in the
+        multiprocessing Pool
+    maintenance_work_mem: str or None
+        What to set postgresql's maintenance_work_mem option to: this helps
+        when rebuilding large indexes, etc.
     """
 
-    global database
-    database = db
-
     hdf = HDFMetadata(
         file_name, keys, metadata_attr="atlas_metadata", metadata_keys=["levels"]
    )
 
     tables = create_hdf_table_objects(hdf, csv_chunksize=csv_chunksize)
 
-    for table in tables:
-        _copy_worker(table, defer_sql_objs=True)
+    if processes is None:
 
+        # Single-threaded run
+        for table in tables:
+            _copy_worker(table, engine_args, engine_kwargs, maintenance_work_mem)
 
-def multiprocess_hdf_to_postgres(
-    file_name, db, keys=[], processes=4, csv_chunksize=10 ** 6
-):
-    """
-    Copy tables in a HDF file to PostgreSQL database using a multiprocessing Pool
+    elif type(processes) is int:
 
-    Parameters
-    ----------
-    file_name: str
-        Name of file or path to file of HDF to use to copy
-    db: SQLAlchemy object
-        Destination database
-    keys: list of strings
-        HDF keys to copy
-    processes: int
-        Number of processes in the Pool
-    csv_chunksize: int
-        Maximum number of StringIO CSV rows to keep in memory at a time
-    """
+        args = zip(
+            tables,
+            [engine_args] * len(tables),
+            [engine_kwargs] * len(tables),
+            [maintenance_work_mem] * len(tables)
+        )
 
-    global database
-    database = db
+        try:
+            p = Pool(processes)
+            result = p.starmap_async(_copy_worker, args, chunksize=1)
 
-    hdf = HDFMetadata(
-        file_name, keys, metadata_attr="atlas_metadata", metadata_keys=["levels"]
-    )
+        finally:
+            del tables
+            del hdf
+            p.close()
+            p.join()
 
-    tables = create_hdf_table_objects(hdf, csv_chunksize=csv_chunksize)
+        if not result.successful():
+            # If there's an exception, throw it, but we don't care about the
+            # results
+            result.get()
 
-    try:
-        p = Pool(processes)
-        p.map(_copy_worker, tables, chunksize=1)
-    finally:
-        del tables
-        del hdf
-        p.close()
-        p.join()
+    else:
+        raise ValueError("processes should be int or None.")
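For reference, a minimal sketch of how the refactored entry point might be called after this merge. The file name, connection URL, and HDF keys below are hypothetical; engine_args and engine_kwargs are passed straight through to SQLAlchemy's create_engine(), and each worker process rebuilds its own Engine from them, as _copy_worker does above.

from pandas_to_postgres import hdf_to_postgres

# Hypothetical connection settings; adjust for your environment.
engine_args = ["postgresql://user:password@localhost:5432/mydb"]
engine_kwargs = {}

# Single-process copy: processes=None runs the plain for-loop branch.
hdf_to_postgres(
    "data.h5",
    engine_args,
    engine_kwargs=engine_kwargs,
    keys=["/countries", "/products"],
    csv_chunksize=10 ** 6,
    maintenance_work_mem="1G",
)

# Multiprocess copy: four workers, each building a fresh Engine from
# engine_args/engine_kwargs inside _copy_worker.
hdf_to_postgres(
    "data.h5",
    engine_args,
    engine_kwargs=engine_kwargs,
    keys=["/countries", "/products"],
    processes=4,
    maintenance_work_mem="1G",
)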
