import os
-import datetime, time
+
import pandas as pd
from flask import current_app
+from api import admin_api
from pipeline import calssify_new_data, clean_and_load_data, archive_rows, match_data, log_db
from config import CURRENT_SOURCE_FILES_PATH
from config import engine
from models import Base


def start_flow():
-    file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)
-
-    job_id = str(int(time.time()))
-    log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
-
-    if file_path_list:
-        with engine.connect() as connection:
-            Base.metadata.create_all(connection)
-
-            # Get previous version of pdp_contacts table, which is used later to classify new records
-            pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
-            pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
-            pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
-
-            current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
-
-            # Clean the input data and normalize/rename columns
-            # Populate new records in secondary tables (donations, volunteer shifts)
-            # input - existing files in path
-            # output - normalized object of all entries, as well as the input json rows for primary sources
-            log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
-            normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
-
-            # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
-            # (If additional inconsistencies are encountered, may need to enforce the schema of
-            # the contacts loader by initializing it from pdp_contacts.)
-            normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-            normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
-
-            # Classify rows as unchanged, updated, or new, compared to the existing state of the DB
-            log_db.log_exec_status(job_id, 'classify', 'executing', '')
-            rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
-
-            # Archive rows that were updated in the current state of the DB (set their archived_date to now)
-            archive_rows.archive(connection, rows_classified["updated"])
-
-            # Match new+updated records against the previous version of pdp_contacts, and
-            # write these rows to the database.
-            match_data.start(connection, rows_classified, manual_matches_df, job_id)
-
-            # Copy raw input rows to json fields in pdp_contacts,
-            # using a temporary table to simplify the update code.
-            current_app.logger.info('Saving json of original rows to pdp_contacts')
-            source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-            # https://www.postgresql.org/docs/8.4/sql-update.html
-            connection.execute('''
-                UPDATE pdp_contacts pdp
-                SET json = to_json(temp.json)
-                FROM _temp_pdp_contacts_loader temp
-                WHERE
-                    pdp.source_type = temp.source_type AND
-                    pdp.source_id = temp.source_id AND
-                    pdp.archived_date IS NULL
-            ''')
-
-            current_app.logger.info('Finished flow script run')
-
-            log_db.log_exec_status(job_id, 'flow', 'complete', '')
+
+    job_id = admin_api.start_job()
+
+    if not job_id:
+        current_app.logger.info('Failed to get job_id')
+    else:
+        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
+
+        file_path_list = os.listdir(CURRENT_SOURCE_FILES_PATH)
+
+        if file_path_list:
+            with engine.connect() as connection:
+                Base.metadata.create_all(connection)
+
+                # Get previous version of pdp_contacts table, which is used later to classify new records
+                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
+                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
+                pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
+
+                current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
+
+                # Clean the input data and normalize/rename columns
+                # Populate new records in secondary tables (donations, volunteer shifts)
+                # input - existing files in path
+                # output - normalized object of all entries, as well as the input json rows for primary sources
+                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
+                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+
+                # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
+                # (If additional inconsistencies are encountered, may need to enforce the schema of
+                # the contacts loader by initializing it from pdp_contacts.)
+                normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
+
+                # Classify rows as unchanged, updated, or new, compared to the existing state of the DB
+                log_db.log_exec_status(job_id, 'classify', 'executing', '')
+                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
+
+                # Archive rows that were updated in the current state of the DB (set their archived_date to now)
+                archive_rows.archive(connection, rows_classified["updated"])
+
+                # Match new+updated records against the previous version of pdp_contacts, and
+                # write these rows to the database.
+                match_data.start(connection, rows_classified, manual_matches_df, job_id)
+
+                # Copy raw input rows to json fields in pdp_contacts,
+                # using a temporary table to simplify the update code.
+                current_app.logger.info('Saving json of original rows to pdp_contacts')
+                source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                # https://www.postgresql.org/docs/8.4/sql-update.html
+                connection.execute('''
+                    UPDATE pdp_contacts pdp
+                    SET json = to_json(temp.json)
+                    FROM _temp_pdp_contacts_loader temp
+                    WHERE
+                        pdp.source_type = temp.source_type AND
+                        pdp.source_id = temp.source_id AND
+                        pdp.archived_date IS NULL
+                ''')
+
+                current_app.logger.info('Finished flow script run')
+
+                log_db.log_exec_status(job_id, 'flow', 'complete', '')
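
Review note: the substantive change is how a run acquires its job id. Previously start_flow() minted its own timestamp id and always proceeded; now it asks admin_api.start_job() for one and bails out with a log message when nothing comes back. That function sits outside this diff, so the following is only a minimal sketch of the contract the caller assumes — the execution_status guard table and the locking scheme are invented for illustration, not the project's actual implementation:

import time

from config import engine  # same engine the flow script imports


def start_job():
    """Return a fresh job id string, or None if a run must not start."""
    job_id = str(int(time.time()))  # timestamp id, as the old flow minted
    try:
        with engine.connect() as connection:
            # Refuse to start while another job is still executing.
            # 'execution_status' is a hypothetical table name for this sketch.
            row = connection.execute(
                "SELECT 1 FROM execution_status WHERE status = 'executing'"
            ).fetchone()
            if row is not None:
                return None
    except Exception:
        return None  # caller logs 'Failed to get job_id' and skips the run
    return job_id

Whatever the real implementation does, the caller's behavior in this diff only depends on truthy-or-falsy: a falsy return short-circuits the whole flow before any file is read.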
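A second design note, on the temp-table round-trip before classification: writing normalized_data to Postgres and reading it straight back means the frame's column types come from read_sql_table's type mapping — the same path that produced pdp_contacts_df — so the comparison inside calssify_new_data is not tripped up by csv type inference (an id read as int from one source and as str from another). A self-contained illustration of the round-trip, assuming the project's config.engine ('_temp_dtype_demo' is an invented table name for the demo):

import pandas as pd

from config import engine

# Column types here come from csv-style string inference.
df = pd.DataFrame({'source_id': ['42'], 'amount': ['10.5']})

with engine.connect() as connection:
    # Write the frame out and read it straight back; afterwards the
    # dtypes are whatever read_sql_table maps the Postgres columns to.
    df.to_sql('_temp_dtype_demo', connection, index=False, if_exists='replace')
    df = pd.read_sql_table('_temp_dtype_demo', connection)

print(df.dtypes)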