Print matching status less frequently

bbucior · bbucior · commit 87f88952920b · 2021-01-19T20:11:58.000-05:00
Log matching progress in buckets of roughly 5% of the rows instead of once per row
diff --git a/src/server/pipeline/match_data.py b/src/server/pipeline/match_data.py
@@ -10,7 +10,6 @@ def start(connection, added_or_updated_rows):
     # Assigns matching ID's to records, as well.
     # WARNING: not thread-safe and could lead to concurrency issues if two users /execute simultaneously
     current_app.logger.info('Start record matching')
-    current_app.logger.warning('Matching updated records not yet handled')
     # Will need to consider updating the existing row contents (filter by active), deactivate,
     # try to match, and merge previous matching groups if applicable
     items_to_update = pd.concat([added_or_updated_rows["new"], added_or_updated_rows["updated"]], ignore_index=True)
@@ -29,15 +28,20 @@ def start(connection, added_or_updated_rows):
         del row["_id"]  # avoid specifying the _id field, so postgres will auto-increment for us
     
     rows = items_to_update.to_dict(orient="records")
+    row_print_freq = np.floor_divide(len(rows), 20)  # approx every 5%
     for row_num, row in enumerate(rows):
-        current_app.logger.info("- Matching row {} of {}".format(row_num+1, len(rows)))
+        if row_num % row_print_freq == 0:
+            current_app.logger.info("- Matching rows {}-{} of {}".format(
+                row_num+1, min(len(rows), row_num+row_print_freq), len(rows))
+            )
+        
         # Exact matches based on specified columns
         row_matches = pdp_contacts[
             (pdp_contacts["first_name"] == row["first_name"]) &
             (pdp_contacts["last_name"] == row["last_name"]) &
             (pdp_contacts["email"] == row["email"]) # TODO: could transform this line into an "or" with phone number
         ]
-        if row_matches.shape[0] == 0:  # new record, no matching rows
+        if row_matches.empty:  # new record, no matching rows
             row_group = max_matching_group
             max_matching_group += 1
         else:  # existing match(es)
@@ -51,7 +55,9 @@ def start(connection, added_or_updated_rows):
         # Updating local pdp_contacts dataframe instead of a roundtrip to postgres within the loop.
         # Indexing by iloc and vector of rows to keep the pd.DataFrame class and avoid implicit
         # casting to a single-typed pd.Series.
-        pdp_contacts = pdp_contacts.append(items_to_update.iloc[[row_num], :])
+        pdp_contacts = pdp_contacts.append(items_to_update.iloc[[row_num], :], ignore_index=True)
     
     # Write new data and matching ID's to postgres in bulk, instead of line-by-line
+    current_app.logger.info("- Writing data to pdp_contacts table")
     items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
+    current_app.logger.info("- Finished load to pdp_contacts table")