
Commit d94dd80

Move CID-specific data corralling functions to atlas_core from pandas-to-postgres
1 parent c6f09e8 commit d94dd80

File tree: 5 files changed (+81, −113 lines)

pandas_to_postgres/__init__.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+from .copy_df import DataFrameCopy
+from .copy_hdf import HDFTableCopy, ClassificationHDFTableCopy, BigHDFTableCopy
+from .utilities import (
+    logger,
+    HDFMetadata,
+    create_file_object,
+    df_generator,
+    cast_pandas,
+)
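These re-exports define the package's public surface; a minimal downstream import (illustrative only, not part of the commit):

# Hypothetical usage of the names re-exported above
from pandas_to_postgres import DataFrameCopy, HDFTableCopy, HDFMetadata, cast_pandas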

pandas_to_postgres/_base_copy.py

Lines changed: 19 additions & 1 deletion
@@ -1,5 +1,7 @@
 from .utilities import logger
 from io import StringIO
+from pandas import DataFrame
+from typing import Callable, List
 from sqlalchemy.schema import AddConstraint, DropConstraint
 from sqlalchemy.exc import SQLAlchemyError
 from sqlalchemy.sql.schema import Table
@@ -43,6 +45,11 @@ def instantiate_sql_objs(self, conn, table_obj):
         """
         When using multiprocessing, pickling of SQLAlchemy objects in __init__ causes
         issues, so allow for deferring until after the pickling to fetch SQLAlchemy objs
+
+        Parameters
+        ----------
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL Table
         """
         self.conn = conn
         self.table_obj = table_obj
@@ -97,8 +104,19 @@ def analyze(self):
         self.conn.execute(f"ANALYZE {self.sql_table};")

     def copy_from_file(self, file_object: StringIO):
-        """COPY to PostgreSQL table using StringIO CSV object"""
+        """
+        COPY to PostgreSQL table using StringIO CSV object
+
+        Parameters
+        ----------
+        file_object: CSV formatted data to COPY from DataFrame to PostgreSQL
+        """
         cur = self.conn.connection.cursor()
         cols = ", ".join([f"{col}" for col in self.columns])
         sql = f"COPY {self.sql_table} ({cols}) FROM STDIN WITH CSV HEADER FREEZE"
         cur.copy_expert(sql=sql, file=file_object)
+
+    def data_formatting(self, df: DataFrame, functions: List[Callable] = [], **kwargs):
+        for f in functions:
+            df = f(df, copy_obj=self, **kwargs)
+        return df
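The new data_formatting hook calls each supplied function as f(df, copy_obj=self, **kwargs), so any callable with that signature can participate in the formatting pipeline. A minimal sketch of a user-defined formatter (strip_whitespace and its column handling are illustrative, not part of this commit):

import pandas as pd

def strip_whitespace(df: pd.DataFrame, copy_obj=None, **kwargs) -> pd.DataFrame:
    # Trim stray whitespace in string cells before COPY; copy_obj exposes the
    # destination table via copy_obj.table_obj if a formatter needs column types
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)
    return df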

pandas_to_postgres/copy_df.py

Lines changed: 3 additions & 18 deletions
@@ -1,11 +1,4 @@
-from .utilities import (
-    create_file_object,
-    df_generator,
-    logger,
-    cast_pandas,
-    add_level_metadata,
-)
-
+from .utilities import create_file_object, df_generator, logger, cast_pandas
 from ._base_copy import BaseCopy

 import pandas as pd
@@ -30,18 +23,10 @@ def __init__(
         self.columns = self.df.columns
         self.rows = self.df.shape[0]

-    def format_df(self):
-        # Handle NaN --> None type casting
-        self.df = cast_pandas(self.df, self.table_obj)
-
-        # Add level (constant) data to frames from dict
-        if self.levels:
-            self.df = add_level_metadata(self.df, self.levels)
-
-    def copy(self):
+    def copy(self, functions=[cast_pandas]):
         self.drop_fks()
         self.drop_pk()
-        self.format_df()
+        self.df = self.data_formatting(self.df, functions=functions)
         with self.conn.begin():
             self.truncate()

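DataFrameCopy.copy now takes the formatter list directly, with cast_pandas as the default. A hedged usage sketch (the DSN, table name, and keyword use of the constructor are assumptions; strip_whitespace is the sketch from _base_copy.py above):

import pandas as pd
from sqlalchemy import create_engine, MetaData
from pandas_to_postgres import DataFrameCopy, cast_pandas

engine = create_engine("postgresql://localhost/example")  # hypothetical database
meta = MetaData(bind=engine)
meta.reflect(only=["my_table"])  # hypothetical destination table

df = pd.DataFrame({"id": [1.0, float("nan")], "name": [" a ", "b"]})

with engine.connect() as conn:
    copier = DataFrameCopy(df, conn=conn, table_obj=meta.tables["my_table"])
    # Formatters run in order through BaseCopy.data_formatting
    copier.copy(functions=[cast_pandas, strip_whitespace])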
pandas_to_postgres/copy_hdf.py

Lines changed: 25 additions & 23 deletions
@@ -2,9 +2,7 @@
     create_file_object,
     df_generator,
     logger,
-    classification_to_pandas,
     cast_pandas,
-    add_level_metadata,
     HDFMetadata,
 )

@@ -26,9 +24,7 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
-        super().__init__(
-            self, defer_sql_objs, conn, table_obj, sql_table, csv_chunksize
-        )
+        super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)

         self.hdf_tables = hdf_tables

@@ -37,35 +33,40 @@ def __init__(
         self.file_name = hdf_meta.file_name
         self.hdf_chunksize = hdf_meta.chunksize

-    def copy_table(self):
+    def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         self.drop_fks()
         self.drop_pk()

         # These need to be one transaction to use COPY FREEZE
         with self.conn.begin():
             self.truncate()
-            self.hdf_to_pg()
+            self.hdf_to_pg(
+                data_formatters=data_formatters,
+                data_formatter_kwargs=data_formatter_kwargs,
+            )

         self.create_pk()
         self.create_fks()
         self.analyze()

-    def hdf_to_pg(self):
+    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return

         for hdf_table in self.hdf_tables:
             logger.info(f"*** {hdf_table} ***")
-            hdf_levels = self.levels.get(hdf_table)

             logger.info("Reading HDF table")
             df = pd.read_hdf(self.file_name, key=hdf_table)
             self.rows += len(df)

-            # Handle NaN --> None type casting and adding const level data
-            df = cast_pandas(df, self.table_obj)
-            df = add_level_metadata(df, hdf_levels)
+            data_formatter_kwargs["hdf_table"] = hdf_table
+
+            logger.info("Formatting data")
+            df = self.data_formatting(
+                df, functions=data_formatters, **data_formatter_kwargs
+            )

             if self.columns is None:
                 self.columns = df.columns
@@ -95,7 +96,6 @@ def __init__(
         csv_chunksize: int = 10 ** 6,
     ):
         super().__init__(
-            self,
             hdf_tables,
             hdf_meta,
             defer_sql_objs,
@@ -105,7 +105,7 @@ def __init__(
             csv_chunksize,
         )

-    def hdf_to_pg(self):
+    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         if self.hdf_tables is None:
             logger.warn("No HDF table found for SQL table {self.sql_table}")
             return
@@ -116,9 +116,11 @@ def hdf_to_pg(self):
             df = pd.read_hdf(self.file_name, key=hdf_table)
             self.rows += len(df)

-            logger.info("Formatting classification")
-            df = classification_to_pandas(df)
-            df = cast_pandas(df, self.table_obj)
+            data_formatter_kwargs["hdf_table"] = hdf_table
+            logger.info("Formatting data")
+            df = self.data_formatting(
+                df, functions=data_formatters, **data_formatter_kwargs
+            )

             if self.columns is None:
                 self.columns = df.columns
@@ -145,7 +147,6 @@ def __init__(
         csv_chunksize: int = 10 ** 6,
     ):
         super().__init__(
-            self,
             hdf_tables,
             hdf_meta,
             defer_sql_objs,
@@ -155,14 +156,13 @@ def __init__(
             csv_chunksize,
         )

-    def hdf_to_pg(self):
+    def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return

         for hdf_table in self.hdf_tables:
             logger.info(f"*** {hdf_table} ***")
-            hdf_levels = self.levels.get(hdf_table)

             with pd.HDFStore(self.file_name) as store:
                 nrows = store.get_storer(hdf_table).nrows
@@ -183,9 +183,11 @@ def hdf_to_pg(self):

                 start += self.hdf_chunksize

-                # Handle NaN --> None type casting and adding const level data
-                df = cast_pandas(df, self.table_obj)
-                df = add_level_metadata(df, hdf_levels)
+                data_formatter_kwargs["hdf_table"] = hdf_table
+                logger.info("Formatting data")
+                df = self.data_formatting(
+                    df, functions=data_formatters, **data_formatter_kwargs
+                )

                 if self.columns is None:
                     self.columns = df.columns
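With copy_table renamed to copy and the formatters parameterized, HDF loads can inject both formatter functions and keyword arguments; note that hdf_to_pg adds the current hdf_table key to data_formatter_kwargs before each call. A hedged sketch (the file path, HDF key, and the argument order inferred from the __init__ diff above are assumptions):

from pandas_to_postgres import HDFTableCopy, HDFMetadata, cast_pandas

hdf_meta = HDFMetadata(file_name="./data.h5", keys=["/my_table"])  # hypothetical key
copier = HDFTableCopy(
    ["/my_table"],        # hdf_tables holding data for this SQL table
    hdf_meta,
    defer_sql_objs=True,  # fetch SQLAlchemy objects after pickling
    sql_table="my_table",
)
copier.instantiate_sql_objs(conn, table_obj)  # conn and table_obj built elsewhere
# cast_pandas is the default; every kwarg reaches each formatter via
# data_formatting, along with the injected hdf_table key
copier.copy(data_formatters=[cast_pandas], data_formatter_kwargs={})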

pandas_to_postgres/utilities.py

Lines changed: 25 additions & 71 deletions
@@ -1,4 +1,5 @@
 import logging
+from typing import List
 import pandas as pd
 from sqlalchemy.sql.schema import Table

@@ -16,7 +17,12 @@


 class HDFMetadata(object):
-    def __init__(self, file_name="./data.h5", keys=None, chunksize=10 ** 7):
+    def __init__(
+        self,
+        file_name: str = "./data.h5",
+        keys: List[str] = None,
+        chunksize: int = 10 ** 7,
+    ):
         self.file_name = file_name
         self.chunksize = chunksize
         self.sql_to_hdf = defaultdict(set)
@@ -42,7 +48,7 @@ def __init__(self, file_name="./data.h5", keys=None, chunksize=10 ** 7):
             logger.warn(f"No SQL table name found for {key}")


-def create_file_object(df):
+def create_file_object(df: pd.DataFrame) -> StringIO:
     """
     Writes pandas dataframe to an in-memory StringIO file object. Adapted from
     https://gist.github.com/mangecoeur/1fbd63d4758c2ba0c470#gistcomment-2086007
@@ -53,16 +59,14 @@ def create_file_object(df):
     return file_object


-def df_generator(df, chunksize):
+def df_generator(df: pd.DataFrame, chunksize: int):
     """
     Create a generator to iterate over chunks of a dataframe

     Parameters
     ----------
-    df: pandas dataframe
-        dataframe to iterate over
-    chunksize: int
-        max number of rows to return in a chunk
+    df: pandas dataframe to iterate over
+    chunksize: max number of rows to return in a chunk
     """
     rows = 0
     if not df.shape[0] % chunksize:
@@ -76,29 +80,35 @@ def df_generator(df, chunksize):
         rows += chunksize


-def cast_pandas(df, sql_table):
+def cast_pandas(
+    df: pd.DataFrame, columns: list = None, copy_obj: object = None, **kwargs
+) -> pd.DataFrame:
     """
     Pandas does not handle null values in integer or boolean fields out of the
     box, so cast fields that should be these types in the database to object
     fields and change np.nan to None

     Parameters
     ----------
-    df: pandas dataframe
-        data frame with fields that are desired to be int or bool as float with
+    df: data frame with fields that are desired to be int or bool as float with
         np.nan that should correspond to None

-    sql_table: SQLAlchemy model
-        destination table object with field names corresponding to those in df
+    columns: list of SQLAlchemy Columns to iterate through to determine data types
+
+    copy_obj: instance of BaseCopy passed from the BaseCopy.data_formatting method where
+        we can access BaseCopy.table_obj.columns

     Returns
     -------
-    df: pandas dataframe
-        dataframe with fields that correspond to Postgres int, bigint, and bool
+    df: dataframe with fields that correspond to Postgres int, bigint, and bool
         fields changed to objects with None values for null
     """

-    for col in sql_table.columns:
+    if columns is None and copy_obj is None:
+        raise ValueError("One of columns or copy_obj must be supplied")
+
+    columns = columns or copy_obj.table_obj.columns
+    for col in columns:
         if str(col.type) in ["INTEGER", "BIGINT"]:
             df[col.name] = df[col.name].apply(
                 lambda x: None if pd.isna(x) else int(x), convert_dtype=False
@@ -109,59 +119,3 @@ def cast_pandas(df, sql_table):
             )

     return df
-
-
-def add_level_metadata(df, hdf_levels):
-    """
-    Updates dataframe fields for constant "_level" fields
-
-    Parameters
-    ----------
-    df: pandas DataFrame
-    hdf_levels: dict
-        dict of level:value fields that are constant for the entire dataframe
-
-    Returns
-    ------
-    df: pandas DataFrame
-    """
-
-    if hdf_levels:
-        logger.info("Adding level metadata values")
-        for entity, level_value in hdf_levels.items():
-            df[entity + "_level"] = level_value
-
-    return df
-
-
-def classification_to_pandas(
-    df,
-    optional_fields=[
-        "name_es",
-        "name_short_en",
-        "name_short_es",
-        "description_en",
-        "description_es",
-        "is_trusted",
-        "in_rankings",
-    ],
-):
-    """Convert a classification from the format it comes in the classification
-    file (which is the format from the 'classifications' github repository)
-    into the format that the flask apps use. Mostly just a thing for dropping
-    unneeded columns and renaming existing ones.
-
-    The optional_fields allows you to specify which fields should be considered
-    optional, i.e. it'll still work if this field doesn't exist in the
-    classification, like the description fields for example.
-    """
-
-    # Sort fields and change names appropriately
-    new_df = df[["index", "code", "name", "level", "parent_id"]]
-    new_df = new_df.rename(columns={"index": "id", "name": "name_en"})
-
-    for field in optional_fields:
-        if field in df:
-            new_df[field] = df[field]
-
-    return new_df
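Standalone, cast_pandas now accepts either an explicit column list or a copy_obj, raising ValueError when neither is given. A small direct-call sketch (the engine URL and table name are assumptions):

import pandas as pd
from sqlalchemy import create_engine, MetaData
from pandas_to_postgres import cast_pandas

engine = create_engine("postgresql://localhost/example")  # hypothetical database
meta = MetaData(bind=engine)
meta.reflect(only=["my_table"])  # hypothetical table with an INTEGER "id" column

df = pd.DataFrame({"id": [1.0, float("nan")]})  # float column destined for INTEGER
df = cast_pandas(df, columns=meta.tables["my_table"].columns)
# NaN becomes None, so the COPY emits a proper SQL NULL instead of "nan"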
