@@ -5,19 +5,11 @@
 from flask import current_app
 
 
-# todo: match and load
-# Compare each new and updated item to all records in the DB
-# (including all other items that are new and updated this iteration) - for each item:
-# if it matches - it will get the same matching id as the match
-# if it doesn't - generate matching id (some prefix with increment?)
-# load it with created_at = now and archived_at = null
-
 def start(connection, added_or_updated_rows):
     # Match new records to each other and existing pdp_contacts data.
     # Assigns matching IDs to records, as well.
     # WARNING: not thread-safe and could lead to concurrency issues if two users hit /execute simultaneously
     current_app.logger.info('Start record matching')
-    current_app.logger.warning('Matching updated records not yet handled')
     # Will need to consider updating the existing row contents (filter by active), deactivate,
     # try to match, and merge previous matching groups if applicable
     items_to_update = pd.concat([added_or_updated_rows["new"], added_or_updated_rows["updated"]], ignore_index=True)
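
A note on the input shape: `added_or_updated_rows` is a dict of two DataFrames keyed `"new"` and `"updated"`, and `ignore_index=True` renumbers the combined frame `0..n-1`; that fresh RangeIndex is what lets the positional `row_num` from `enumerate` in the loop below double as a `.loc` label. A minimal sketch with invented values (only the dict keys and column names come from the code itself):

```python
import pandas as pd

# Hypothetical input; the dict keys and column names mirror the code above,
# all values are made up for illustration.
added_or_updated_rows = {
    "new": pd.DataFrame([
        {"first_name": "Jane", "last_name": "Doe", "email": "jane@example.com",
         "source_type": "sourceA", "source_id": "1"},
    ]),
    "updated": pd.DataFrame([
        {"first_name": "John", "last_name": "Roe", "email": "john@example.com",
         "source_type": "sourceB", "source_id": "2"},
    ]),
}

items_to_update = pd.concat(
    [added_or_updated_rows["new"], added_or_updated_rows["updated"]],
    ignore_index=True,  # fresh 0..n-1 index across both frames
)
print(items_to_update.index.tolist())  # [0, 1]
```
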
@@ -26,36 +18,46 @@ def start(connection, added_or_updated_rows):
     if pdp_contacts["matching_id"].dropna().size == 0:
         max_matching_group = 0
     else:
-        max_matching_group = max(pdp_contacts["matching_id"].dropna()) + 1
+        max_matching_group = max(pdp_contacts["matching_id"].dropna())
 
-    # Iterate over the dataframe using integer index location,
-    # because iterrows returns a type-inconsistent series, and itertuples would be more complex.
-    num_added_or_updated = items_to_update.shape[0]
-    for row_num in range(num_added_or_updated):
-        current_app.logger.info("- Matching row {} of {}".format(row_num + 1, num_added_or_updated))
-        row = items_to_update.iloc[[row_num], :].copy()  # pd.DataFrame
+    # Initialize column metadata we'll write to pdp_contacts
+    items_to_update["matching_id"] = 0  # int placeholder, overwritten per row in the loop
+    items_to_update["archived_date"] = np.nan
+    items_to_update["created_date"] = datetime.datetime.now()
+    if "_id" in items_to_update.columns:
+        del items_to_update["_id"]  # avoid specifying the _id field, so postgres will auto-increment for us
+
+    rows = items_to_update.to_dict(orient="records")
+    row_print_freq = max(1, np.floor_divide(len(rows), 20))  # approx every 5% (or every row if small)
+    for row_num, row in enumerate(rows):
+        if row_num % row_print_freq == 0:
+            current_app.logger.info("- Matching rows {}-{} of {}".format(
+                row_num + 1, min(len(rows), row_num + row_print_freq), len(rows))
+            )
+
         # Exact matches based on specified columns
         row_matches = pdp_contacts[
-            (pdp_contacts["first_name"] == row["first_name"].values[0]) &
-            (pdp_contacts["last_name"] == row["last_name"].values[0]) &
-            (pdp_contacts["email"] == row["email"].values[0])
+            (pdp_contacts["first_name"] == row["first_name"]) &
+            (pdp_contacts["last_name"] == row["last_name"]) &
+            (pdp_contacts["email"] == row["email"])  # TODO: could transform this line into an "or" with phone number
         ]
-        if row_matches.shape[0] == 0:  # new record, no matching rows
-            row_group = max_matching_group
+        if row_matches.empty:  # new record, no matching rows
             max_matching_group += 1
+            row_group = max_matching_group
         else:  # existing match(es)
             row_group = row_matches["matching_id"].values[0]
             if not all(row_matches["matching_id"] == row_group):
                 current_app.logger.warning(
                     "Source {} with ID {} is matching multiple groups in pdp_contacts ({})"
                     .format(row["source_type"], row["source_id"], str(row_matches["matching_id"].drop_duplicates()))
                 )
-        row["created_date"] = datetime.datetime.now()
-        row["archived_date"] = np.nan
-        row["matching_id"] = row_group
-        if "_id" in row.columns:
-            del row["_id"]  # avoid specifying the _id field, so postgres will auto-increment for us
-
-        # Round-trip to the database on every loop iteration is inefficient and could be rewritten much faster
-        row.to_sql('pdp_contacts', connection, index=False, if_exists='append')
-        pdp_contacts = pd.read_sql_table('pdp_contacts', connection)
+        items_to_update.loc[row_num, "matching_id"] = row_group
+        # Update the local pdp_contacts dataframe instead of round-tripping to postgres within the loop.
+        # Indexing by iloc with a list of rows keeps the pd.DataFrame class and avoids implicit
+        # casting to a single-typed pd.Series.
+        pdp_contacts = pdp_contacts.append(items_to_update.iloc[[row_num], :], ignore_index=True)
+
+    # Write new data and matching IDs to postgres in bulk, instead of line-by-line
+    current_app.logger.info("- Writing data to pdp_contacts table")
+    items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
+    current_app.logger.info("- Finished load to pdp_contacts table")
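
To make the grouping rule concrete, here is a self-contained toy run of the exact-match logic (first name, last name, and email all equal), outside Flask and postgres; all data values are invented:

```python
import pandas as pd

# Toy stand-in for the pdp_contacts table, with one existing matching group.
pdp_contacts = pd.DataFrame([
    {"first_name": "Jane", "last_name": "Doe", "email": "jane@example.com", "matching_id": 1},
])
max_matching_group = int(pdp_contacts["matching_id"].max())  # 1

new_rows = [
    {"first_name": "Jane", "last_name": "Doe", "email": "jane@example.com"},  # exact match -> joins group 1
    {"first_name": "John", "last_name": "Roe", "email": "john@example.com"},  # no match    -> new group 2
]

for row in new_rows:
    matches = pdp_contacts[
        (pdp_contacts["first_name"] == row["first_name"]) &
        (pdp_contacts["last_name"] == row["last_name"]) &
        (pdp_contacts["email"] == row["email"])
    ]
    if matches.empty:
        max_matching_group += 1
        row["matching_id"] = max_matching_group
    else:
        row["matching_id"] = matches["matching_id"].values[0]
    # Grow the local frame so later rows in the same batch can match earlier ones.
    pdp_contacts = pd.concat([pdp_contacts, pd.DataFrame([row])], ignore_index=True)

print([r["matching_id"] for r in new_rows])  # [1, 2]
```

This also shows why appending to the local `pdp_contacts` inside the loop is load-bearing: two brand-new rows for the same person must land in the same group, which only works if the first of them is already visible when the second is matched.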
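One forward-compatibility caveat on the loop body: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the `pdp_contacts.append(...)` line raises `AttributeError`. The drop-in replacement is `pd.concat`:

```python
# pandas >= 2.0 equivalent of pdp_contacts.append(..., ignore_index=True)
pdp_contacts = pd.concat(
    [pdp_contacts, items_to_update.iloc[[row_num], :]],
    ignore_index=True,
)
```

If the single bulk write ever proves slow, `to_sql` also accepts `chunksize=` and `method="multi"` to batch multi-row INSERTs.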