import os
-import datetime, time
+
import pandas as pd
from flask import current_app
+from api import admin_api
from pipeline import calssify_new_data, clean_and_load_data, archive_rows, match_data, log_db
from config import CURRENT_SOURCE_FILES_PATH
from config import engine
from models import Base


def start_flow():
-    file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)
-
-    job_id = str(int(time.time()))
-    log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
-
-    if file_path_list:
-        with engine.connect() as connection:
-            Base.metadata.create_all(connection)
-
-            # Get previous version of pdp_contacts table, which is used later to classify new records
-            pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
-            pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
-            pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
-
-            current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
-
-            # Clean the input data and normalize/rename columns
-            # Populate new records in secondary tables (donations, volunteer shifts)
-            # input - existing files in path
-            # output - normalized object of all entries, as well as the input json rows for primary sources
-            log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
-            normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
-
-            # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
-            # (If additional inconsistencies are encountered, may need to enforce the schema of
-            # the contacts loader by initializing it from pdp_contacts.)
-            normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-            normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
-
-            # Classify rows as unchanged, updated, or new, compared to the existing state of the DB
-            log_db.log_exec_status(job_id, 'classify', 'executing', '')
-            rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
-
-            # Archive rows that were updated in the current state of the DB (set their archived_date to now)
-            archive_rows.archive(connection, rows_classified["updated"])
-
-            # Match new+updated records against the previous version of pdp_contacts, and
-            # write these rows to the database.
-            match_data.start(connection, rows_classified, manual_matches_df, job_id)
-
-            # Copy raw input rows to json fields in pdp_contacts,
-            # using a temporary table to simplify the update code.
-            current_app.logger.info('Saving json of original rows to pdp_contacts')
-            source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-            # https://www.postgresql.org/docs/8.4/sql-update.html
-            connection.execute('''
-                UPDATE pdp_contacts pdp
-                SET json = to_json(temp.json)
-                FROM _temp_pdp_contacts_loader temp
-                WHERE
-                    pdp.source_type = temp.source_type AND
-                    pdp.source_id = temp.source_id AND
-                    pdp.archived_date IS NULL
-            ''')
-
-            current_app.logger.info('Finished flow script run')
-
-            log_db.log_exec_status(job_id, 'flow', 'complete', '')
+
+    job_id = admin_api.start_job()
+
+    if not job_id:
+        current_app.logger.info('Failed to get job_id')
+    else:
+        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
+
+        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)
+
+        if file_path_list:
+            with engine.connect() as connection:
+                Base.metadata.create_all(connection)
+
+                # Get previous version of pdp_contacts table, which is used later to classify new records
+                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
+                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
+                pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
+
+                current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
+
+                # Clean the input data and normalize/rename columns
+                # Populate new records in secondary tables (donations, volunteer shifts)
+                # input - existing files in path
+                # output - normalized object of all entries, as well as the input json rows for primary sources
+                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
+                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+
+                # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
+                # (If additional inconsistencies are encountered, may need to enforce the schema of
+                # the contacts loader by initializing it from pdp_contacts.)
+                normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
+
+                # Classify rows as unchanged, updated, or new, compared to the existing state of the DB
+                log_db.log_exec_status(job_id, 'classify', 'executing', '')
+                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
+
+                # Archive rows that were updated in the current state of the DB (set their archived_date to now)
+                archive_rows.archive(connection, rows_classified["updated"])
+
+                # Match new+updated records against the previous version of pdp_contacts, and
+                # write these rows to the database.
+                match_data.start(connection, rows_classified, manual_matches_df, job_id)
+
+                # Copy raw input rows to json fields in pdp_contacts,
+                # using a temporary table to simplify the update code.
+                current_app.logger.info('Saving json of original rows to pdp_contacts')
+                source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                # https://www.postgresql.org/docs/8.4/sql-update.html
+                connection.execute('''
+                    UPDATE pdp_contacts pdp
+                    SET json = to_json(temp.json)
+                    FROM _temp_pdp_contacts_loader temp
+                    WHERE
+                        pdp.source_type = temp.source_type AND
+                        pdp.source_id = temp.source_id AND
+                        pdp.archived_date IS NULL
+                ''')
+
+                current_app.logger.info('Finished flow script run')
+
+                log_db.log_exec_status(job_id, 'flow', 'complete', '')
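
Review note: the substantive change is how a run acquires its job id. Previously start_flow() minted its own timestamp id and always proceeded; now it asks admin_api.start_job() for one and bails out with a log message when nothing comes back. That function sits outside this diff, so the following is only a minimal sketch of the contract the caller assumes — the execution_status guard table and the locking scheme are invented for illustration, not the project's actual implementation:

import time

from config import engine  # same engine the flow script imports


def start_job():
    """Return a fresh job id string, or None if a run must not start."""
    job_id = str(int(time.time()))  # timestamp id, as the old flow minted
    try:
        with engine.connect() as connection:
            # Refuse to start while another job is still executing.
            # 'execution_status' is a hypothetical table name for this sketch.
            row = connection.execute(
                "SELECT 1 FROM execution_status WHERE status = 'executing'"
            ).fetchone()
            if row is not None:
                return None
    except Exception:
        return None  # caller logs 'Failed to get job_id' and skips the run
    return job_id

Whatever the real implementation does, the caller's behavior in this diff only depends on truthy-or-falsy: a falsy return short-circuits the whole flow before any file is read.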
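A second design note, on the temp-table round-trip before classification: writing normalized_data to Postgres and reading it straight back means the frame's column types come from read_sql_table's type mapping — the same path that produced pdp_contacts_df — so the comparison inside calssify_new_data is not tripped up by csv type inference (an id read as int from one source and as str from another). A self-contained illustration of the round-trip, assuming the project's config.engine ('_temp_dtype_demo' is an invented table name for the demo):

import pandas as pd

from config import engine

# Column types here come from csv-style string inference.
df = pd.DataFrame({'source_id': ['42'], 'amount': ['10.5']})

with engine.connect() as connection:
    # Write the frame out and read it straight back; afterwards the
    # dtypes are whatever read_sql_table maps the Postgres columns to.
    df.to_sql('_temp_dtype_demo', connection, index=False, if_exists='replace')
    df = pd.read_sql_table('_temp_dtype_demo', connection)

print(df.dtypes)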