1
1
import os
2
+
2
3
import pandas as pd
3
4
from flask import current_app
4
- from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data
5
+ from api import admin_api
6
+ from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data , log_db
5
7
from config import CURRENT_SOURCE_FILES_PATH
6
8
from config import engine
7
9
from models import Base
8
10
9
11
10
12
def start_flow():
    """Run one end-to-end execution of the data pipeline.

    Acquires a job slot, loads the current ``pdp_contacts`` state, cleans and
    normalizes any new source files, classifies rows as unchanged/updated/new,
    archives superseded rows, matches records, and writes the raw source json
    back onto ``pdp_contacts``.

    Returns one of three outcome strings:
        'busy'          -- could not acquire a job id (another run in progress)
        'nothing to do' -- no source files were found to process
        'completed'     -- the pipeline ran to completion
    """
    job_id = admin_api.start_job()

    if not job_id:
        # Another flow is presumably already running; bail out without
        # touching the database.
        current_app.logger.info('Failed to get job_id')
        job_outcome = 'busy'

    else:
        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')

        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)

        if file_path_list:
            with engine.connect() as connection:
                Base.metadata.create_all(connection)

                # Get previous version of pdp_contacts table, which is used
                # later to classify new records. Only non-archived rows count
                # as the "current" state.
                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
                pdp_contacts_df = pdp_contacts_df.drop(
                    columns=['archived_date', 'created_date', '_id', 'matching_id'])

                current_app.logger.info(
                    'Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))

                # Clean the input data and normalize/rename columns.
                # Populate new records in secondary tables (donations, volunteer shifts).
                # input  - existing files in path
                # output - normalized object of all entries, as well as the
                #          input json rows for primary sources
                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(
                    connection, pdp_contacts_df, file_path_list)

                # Standardize column data types via postgres (e.g. reading a
                # csv column as int vs. str) by round-tripping through a temp
                # table. (If additional inconsistencies are encountered, may
                # need to enforce the schema of the contacts loader by
                # initializing it from pdp_contacts.)
                normalized_data.to_sql('_temp_pdp_contacts_loader', connection,
                                       index=False, if_exists='replace')
                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)

                # Classify rows into unchanged / updated / new, compared to
                # the existing state of the DB.
                log_db.log_exec_status(job_id, 'classify', 'executing', '')
                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)

                # Archive rows that were updated in the current state of the
                # DB (sets their archived_date to now).
                archive_rows.archive(connection, rows_classified["updated"])

                # Match new+updated records against previous version of
                # pdp_contacts database, and write these rows to the database.
                match_data.start(connection, rows_classified, manual_matches_df, job_id)

                # Copy raw input rows to json fields in pdp_contacts,
                # using a temporary table to simplify the update code.
                current_app.logger.info('Saving json of original rows to pdp_contacts')
                source_json.to_sql('_temp_pdp_contacts_loader', connection,
                                   index=False, if_exists='replace')
                # https://www.postgresql.org/docs/8.4/sql-update.html
                connection.execute('''
                    UPDATE pdp_contacts pdp
                    SET json = to_json(temp.json)
                    FROM _temp_pdp_contacts_loader temp
                    WHERE
                        pdp.source_type = temp.source_type AND
                        pdp.source_id = temp.source_id AND
                        pdp.archived_date IS NULL
                ''')

            current_app.logger.info('Finished flow script run')
            job_outcome = 'completed'

        else:  # No files in list
            current_app.logger.info('No files to process')
            job_outcome = 'nothing to do'

        # Record completion for both the 'completed' and 'nothing to do'
        # outcomes; the 'busy' branch never obtained a job_id to log against.
        log_db.log_exec_status(job_id, 'flow', 'complete', '')

    return job_outcome
0 commit comments