@@ -10,7 +10,6 @@ def start(connection, added_or_updated_rows):
10
10
# Assigns matching IDs to records, as well.
11
11
# WARNING: not thread-safe; could lead to concurrency issues if two users call /execute simultaneously
12
12
current_app .logger .info ('Start record matching' )
13
- current_app .logger .warning ('Matching updated records not yet handled' )
14
13
# Will need to consider updating the existing row contents (filter by active), deactivate,
15
14
# try to match, and merge previous matching groups if applicable
16
15
items_to_update = pd .concat ([added_or_updated_rows ["new" ], added_or_updated_rows ["updated" ]], ignore_index = True )
@@ -29,15 +28,20 @@ def start(connection, added_or_updated_rows):
29
28
del row ["_id" ] # avoid specifying the _id field, so postgres will auto-increment for us
30
29
31
30
rows = items_to_update .to_dict (orient = "records" )
31
+ row_print_freq = np .floor_divide (len (rows ), 20 ) # approx every 5%
32
32
for row_num , row in enumerate (rows ):
33
- current_app .logger .info ("- Matching row {} of {}" .format (row_num + 1 , len (rows )))
33
+ if row_num % row_print_freq == 0 :
34
+ current_app .logger .info ("- Matching rows {}-{} of {}" .format (
35
+ row_num + 1 , min (len (rows ), row_num + row_print_freq ), len (rows ))
36
+ )
37
+
34
38
# Exact matches based on specified columns
35
39
row_matches = pdp_contacts [
36
40
(pdp_contacts ["first_name" ] == row ["first_name" ]) &
37
41
(pdp_contacts ["last_name" ] == row ["last_name" ]) &
38
42
(pdp_contacts ["email" ] == row ["email" ]) # TODO: could transform this line into an "or" with phone number
39
43
]
40
- if row_matches .shape [ 0 ] == 0 : # new record, no matching rows
44
+ if row_matches .empty : # new record, no matching rows
41
45
row_group = max_matching_group
42
46
max_matching_group += 1
43
47
else : # existing match(es)
@@ -51,7 +55,9 @@ def start(connection, added_or_updated_rows):
51
55
# Updating local pdp_contacts dataframe instead of a roundtrip to postgres within the loop.
52
56
# Indexing by iloc and vector of rows to keep the pd.DataFrame class and avoid implicit
53
57
# casting to a single-typed pd.Series.
54
- pdp_contacts = pdp_contacts .append (items_to_update .iloc [[row_num ], :])
58
+ pdp_contacts = pdp_contacts .append (items_to_update .iloc [[row_num ], :], ignore_index = True )
55
59
56
60
# Write new data and matching IDs to postgres in bulk, instead of line-by-line
61
+ current_app .logger .info ("- Writing data to pdp_contacts table" )
57
62
items_to_update .to_sql ('pdp_contacts' , connection , index = False , if_exists = 'append' )
63
+ current_app .logger .info ("- Finished load to pdp_contacts table" )
0 commit comments