Commit 3c56d62

Updated log_exec_status and calls to it
1 parent eadc327 commit 3c56d62

4 files changed: +38 additions, -19 deletions


src/server/pipeline/clean_and_load_data.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 from flask import current_app
 import sqlalchemy
 from config import CURRENT_SOURCE_FILES_PATH
-
+from pipeline import log_db
 
 def start(connection, pdp_contacts_df, file_path_list):
     result = pd.DataFrame(columns=pdp_contacts_df.columns)

src/server/pipeline/flow_script.py

Lines changed: 9 additions & 2 deletions

@@ -1,7 +1,8 @@
 import os
+import datetime, time
 import pandas as pd
 from flask import current_app
-from pipeline import calssify_new_data, clean_and_load_data, archive_rows, match_data
+from pipeline import calssify_new_data, clean_and_load_data, archive_rows, match_data, log_db
 from config import CURRENT_SOURCE_FILES_PATH
 from config import engine
 from models import Base
@@ -10,6 +11,9 @@
 def start_flow():
     file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)
 
+    job_id = str(int(time.time()))
+    log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
+
     if file_path_list:
         with engine.connect() as connection:
             Base.metadata.create_all(connection)
@@ -25,6 +29,7 @@ def start_flow():
             # Populate new records in secondary tables (donations, volunteer shifts)
             # input - existing files in path
             # output - normalized object of all entries, as well as the input json rows for primary sources
+            log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
            normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
 
             # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
@@ -34,14 +39,15 @@ def start_flow():
             normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
 
             # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
+            log_db.log_exec_status(job_id, 'classify', 'executing', '')
             rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
 
             # Archives rows that were updated in the current state of the DB (changes their archived_date to now)
             archive_rows.archive(connection, rows_classified["updated"])
 
             # Match new+updated records against previous version of pdp_contacts database, and
             # write these rows to the database.
-            match_data.start(connection, rows_classified, manual_matches_df)
+            match_data.start(connection, rows_classified, manual_matches_df, job_id)
 
             # Copy raw input rows to json fields in pdp_contacts,
             # using a temporary table to simplify the update code.
@@ -60,3 +66,4 @@ def start_flow():
 
     current_app.logger.info('Finished flow script run')
 
+    log_db.log_exec_status(job_id, 'flow', 'complete', '')
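With every stage now writing a row keyed by `job_id`, a caller can poll pipeline progress from the database. A minimal read-side sketch, assuming the `execution_status` schema from this commit and the shared `engine` from `config`; the `get_status` helper itself is hypothetical and not part of this change:

```python
import sqlalchemy
from config import engine  # same engine the pipeline modules use


def get_status(job_id: str):
    """Fetch the current stage/status row for a job, or None if not logged.

    Hypothetical helper: the execution_status table is defined in this
    commit, but no read-side function is shown in the change. Written
    against SQLAlchemy 1.x-style connection.execute() with bound params.
    """
    with engine.connect() as connection:
        row = connection.execute(
            sqlalchemy.text(
                "SELECT stage, status, details, update_stamp "
                "FROM execution_status WHERE job_id = :job_id"
            ),
            job_id=int(job_id),  # column is int4; job_id travels as str(int(time.time()))
        ).fetchone()
    return dict(row) if row else None
```

Because the upsert in `log_db` keeps one row per `job_id`, the single fetched row always reflects the most recently logged stage.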

src/server/pipeline/log_db.py

Lines changed: 22 additions & 9 deletions

@@ -11,24 +11,37 @@
 
 metadata = MetaData()
 
-kvt = Table("kv_unique", metadata, autoload=True, autoload_with=engine)
+ex_stat = Table("execution_status", metadata, autoload=True, autoload_with=engine)
 
+# Alembic version bfb1262d3195
 
+# CREATE TABLE public.execution_status (
+#     "_id" serial NOT NULL,
+#     job_id int4 NOT NULL,
+#     stage varchar(32) NOT NULL,
+#     status varchar(32) NOT NULL,
+#     details varchar(128) NOT NULL,
+#     update_stamp timestamp NOT NULL DEFAULT now(),
+#     CONSTRAINT execution_status_pkey null
+# );
 
-def log_exec_status(job_id: str, job_status: dict):
 
-    # Write Last Execution stats to DB
-    # See Alembic Revision ID: 05e0693f8cbb for table definition
+
+def log_exec_status(job_id: str, exec_stage: str, exec_status: str, job_details: str):
+    """Log execution status (job_id, stage, status, details) to DB."""
+
     with engine.connect() as connection:
-        ins_stmt = insert(kvt).values(  # Postgres-specific insert() supporting ON CONFLICT
-            keycol='job-' + job_id,
-            valcol=json.dumps(job_status)
+        ins_stmt = insert(ex_stat).values(  # Postgres-specific insert() supporting ON CONFLICT
+            job_id=job_id,
+            stage=exec_stage,
+            status=exec_status,
+            details=json.dumps(job_details)
         )
 
         # If key already present in DB, do update instead
         upsert = ins_stmt.on_conflict_do_update(
-            constraint='kv_unique_keycol_key',
-            set_=dict(valcol=json.dumps(job_status))
+            constraint='uq_job_id',
+            set_=dict(stage=exec_stage, status=exec_status, details=json.dumps(job_details))
         )
 
         try:
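The commented DDL cites Alembic revision bfb1262d3195, and `on_conflict_do_update` targets a constraint named `uq_job_id`, which implies a UNIQUE constraint on `job_id` so each job keeps exactly one status row. A sketch of what that migration could look like, reconstructed from the comment; the revision file itself is not in this commit, and `down_revision` is a placeholder:

```python
"""Assumed shape of the execution_status migration (not shown in this commit).

Reconstructed from the commented DDL in log_db.py. The unique constraint
name must match the one passed to on_conflict_do_update().
"""
import sqlalchemy as sa
from alembic import op

revision = 'bfb1262d3195'   # from the comment in log_db.py
down_revision = None        # placeholder: real parent revision unknown


def upgrade():
    op.create_table(
        'execution_status',
        sa.Column('_id', sa.Integer, primary_key=True),  # serial PK
        sa.Column('job_id', sa.Integer, nullable=False),
        sa.Column('stage', sa.String(32), nullable=False),
        sa.Column('status', sa.String(32), nullable=False),
        sa.Column('details', sa.String(128), nullable=False),
        sa.Column('update_stamp', sa.DateTime, nullable=False,
                  server_default=sa.text('now()')),
        sa.UniqueConstraint('job_id', name='uq_job_id'),  # ON CONFLICT target
    )


def downgrade():
    op.drop_table('execution_status')
```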

src/server/pipeline/match_data.py

Lines changed: 6 additions & 7 deletions

@@ -15,15 +15,16 @@ def normalize_before_match(value):
     return result
 
 
-def start(connection, added_or_updated_rows, manual_matches_df):
+def start(connection, added_or_updated_rows, manual_matches_df, job_id):
     # Match new records to each other and existing pdp_contacts data.
     # Assigns matching ID's to records, as well.
     # WARNING: not thread-safe and could lead to concurrency issues if two users /execute simultaneously
     current_app.logger.info('Start record matching')
     # Will need to consider updating the existing row contents (filter by active), deactivate,
     # try to match, and merge previous matching groups if applicable
-    job_id = str(int(time.time()))
-    log_db.log_exec_status(job_id, {'status': 'starting', 'at_row': 0, 'of_rows': 0})
+    # job_id = str(int(time.time()))
+    log_db.log_exec_status(job_id, 'matching', 'executing', '')
+
     current_app.logger.info("***** Running execute job ID " + job_id + " *****")
     items_to_update = pd.concat([added_or_updated_rows["new"], added_or_updated_rows["updated"]], ignore_index=True)
     pdp_contacts = pd.read_sql_table('pdp_contacts', connection)
@@ -55,9 +56,7 @@ def start(connection, added_or_updated_rows, manual_matches_df):
         current_app.logger.info("- Matching rows {}-{} of {}".format(
             row_num + 1, min(len(rows), row_num + row_print_freq), len(rows))
         )
-        log_db.log_exec_status(job_id, {
-            'status': 'executing', 'at_row': row_num + 1, 'of_rows': len(rows)
-        })
+        log_db.log_exec_status(job_id, 'matching', 'executing', str({'at_row': row_num + 1, 'of_rows': len(rows)}))
 
         # Exact matches based on specified columns
         row_matches = pdp_contacts[
@@ -103,4 +102,4 @@ def start(connection, added_or_updated_rows, manual_matches_df):
     items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
     current_app.logger.info("- Finished load to pdp_contacts table")
 
-    log_db.log_exec_status(job_id, {'status': 'complete', 'at_row': len(rows), 'of_rows': len(rows)})
+    log_db.log_exec_status(job_id, 'matching', 'executing', str({'at_row': len(rows), 'of_rows': len(rows)}))
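Two things to note about these call sites: the final call after the load still reports status 'executing' (the run-level 'complete' row is written by flow_script), and `details` now stores `json.dumps()` of a Python-dict repr via `str({...})`, not a JSON object. A hedged sketch of decoding such a `details` value; no reader for this column appears in the commit, so the helper is hypothetical:

```python
import ast
import json


def parse_progress(details_cell: str) -> dict:
    """Decode a details value written by log_exec_status.

    match_data passes str({'at_row': ..., 'of_rows': ...}) and log_exec_status
    wraps it in json.dumps(), so the stored value is a JSON string containing
    a Python-dict repr: json.loads() strips the JSON layer, then
    ast.literal_eval() safely parses the repr. Hypothetical helper.
    """
    inner = json.loads(details_cell)  # -> "{'at_row': 1, 'of_rows': 10}" or ""
    return ast.literal_eval(inner) if inner else {}


# Example: a stored cell of '"{\'at_row\': 1, \'of_rows\': 10}"'
# decodes to {'at_row': 1, 'of_rows': 10}; the empty '' sentinel yields {}.
```

Writing `details` as a JSON object directly (i.e. passing a dict and letting `log_exec_status` serialize it once) would avoid the double encoding, but the sketch above matches what this commit actually stores.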
