1
1
import os
2
+
2
3
import pandas as pd
3
4
from flask import current_app
4
- from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data
5
+ from api import admin_api
6
+ from pipeline import calssify_new_data , clean_and_load_data , archive_rows , match_data , log_db
5
7
from config import CURRENT_SOURCE_FILES_PATH
6
8
from config import engine
7
9
from models import Base
8
10
9
11
10
12
def start_flow():
    """Run one end-to-end execution of the data pipeline.

    Acquires a job slot, loads the current ``pdp_contacts`` state, cleans and
    normalizes any new source files, classifies rows as unchanged/updated/new,
    archives superseded rows, matches records, and writes the raw source json
    back onto ``pdp_contacts``.

    Returns one of three outcome strings:
        'busy'          -- could not acquire a job id (another run in progress)
        'nothing to do' -- no source files were found to process
        'completed'     -- the pipeline ran to completion
    """
    job_id = admin_api.start_job()

    if not job_id:
        # Another flow is presumably already running; bail out without
        # touching the database.
        current_app.logger.info('Failed to get job_id')
        job_outcome = 'busy'

    else:
        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')

        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)

        if file_path_list:
            with engine.connect() as connection:
                Base.metadata.create_all(connection)

                # Get previous version of pdp_contacts table, which is used
                # later to classify new records. Only non-archived rows count
                # as the "current" state.
                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
                pdp_contacts_df = pdp_contacts_df.drop(
                    columns=['archived_date', 'created_date', '_id', 'matching_id'])

                current_app.logger.info(
                    'Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))

                # Clean the input data and normalize/rename columns.
                # Populate new records in secondary tables (donations, volunteer shifts).
                # input  - existing files in path
                # output - normalized object of all entries, as well as the
                #          input json rows for primary sources
                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(
                    connection, pdp_contacts_df, file_path_list)

                # Standardize column data types via postgres (e.g. reading a
                # csv column as int vs. str) by round-tripping through a temp
                # table. (If additional inconsistencies are encountered, may
                # need to enforce the schema of the contacts loader by
                # initializing it from pdp_contacts.)
                normalized_data.to_sql('_temp_pdp_contacts_loader', connection,
                                       index=False, if_exists='replace')
                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)

                # Classify rows into unchanged / updated / new, compared to
                # the existing state of the DB.
                log_db.log_exec_status(job_id, 'classify', 'executing', '')
                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)

                # Archive rows that were updated in the current state of the
                # DB (sets their archived_date to now).
                archive_rows.archive(connection, rows_classified["updated"])

                # Match new+updated records against previous version of
                # pdp_contacts database, and write these rows to the database.
                match_data.start(connection, rows_classified, manual_matches_df, job_id)

                # Copy raw input rows to json fields in pdp_contacts,
                # using a temporary table to simplify the update code.
                current_app.logger.info('Saving json of original rows to pdp_contacts')
                source_json.to_sql('_temp_pdp_contacts_loader', connection,
                                   index=False, if_exists='replace')
                # https://www.postgresql.org/docs/8.4/sql-update.html
                connection.execute('''
                    UPDATE pdp_contacts pdp
                    SET json = to_json(temp.json)
                    FROM _temp_pdp_contacts_loader temp
                    WHERE
                        pdp.source_type = temp.source_type AND
                        pdp.source_id = temp.source_id AND
                        pdp.archived_date IS NULL
                ''')

            current_app.logger.info('Finished flow script run')
            job_outcome = 'completed'

        else:  # No files in list
            current_app.logger.info('No files to process')
            job_outcome = 'nothing to do'

        # Record completion for both the 'completed' and 'nothing to do'
        # outcomes; the 'busy' branch never obtained a job_id to log against.
        log_db.log_exec_status(job_id, 'flow', 'complete', '')

    return job_outcome
0 commit comments