lower case before matching

urirot · urirot · commit b812e952adf8 · 2021-03-30T20:11:05.000-04:00
add function for string cleaning in matching
diff --git a/src/server/pipeline/match_data.py b/src/server/pipeline/match_data.py
@@ -6,6 +6,14 @@
 from pipeline import log_db
 
 
+def normalize_before_match(value):
+    result = None
+
+    if isinstance(value, str):
+        result = value.lower()
+
+    return result
+
 
 def start(connection, added_or_updated_rows):
     # Match new records to each other and existing pdp_contacts data.
@@ -16,7 +24,7 @@ def start(connection, added_or_updated_rows):
     # try to match, and merge previous matching groups if applicable
 
     job_id = str(int(time.time()))
-    log_db.log_exec_status(job_id,{'status': 'starting', 'at_row':  0, 'of_rows':0})
+    log_db.log_exec_status(job_id, {'status': 'starting', 'at_row': 0, 'of_rows': 0})
     current_app.logger.info("***** Running execute job ID " + job_id + " *****")
     items_to_update = pd.concat([added_or_updated_rows["new"], added_or_updated_rows["updated"]], ignore_index=True)
     pdp_contacts = pd.read_sql_table('pdp_contacts', connection)
@@ -36,22 +44,32 @@ def start(connection, added_or_updated_rows):
     for row_num, row in enumerate(rows):
         if row_num % row_print_freq == 0:
             current_app.logger.info("- Matching rows {}-{} of {}".format(
-                row_num+1, min(len(rows), row_num+row_print_freq), len(rows))
+                row_num + 1, min(len(rows), row_num + row_print_freq), len(rows))
             )
-            log_db.log_exec_status(job_id,{'status': 'executing', 'at_row':  row_num+1, 'of_rows':len(rows)})
+            log_db.log_exec_status(job_id, {
+                'status': 'executing', 'at_row': row_num + 1, 'of_rows': len(rows)
+            })
 
         # Exact matches based on specified columns
+
         row_matches = pdp_contacts[
             (
-                    ((pdp_contacts["first_name"] == row["first_name"]) &
-                    (pdp_contacts["last_name"] == row["last_name"]))
+                    ((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
+                        row["first_name"])) &
+                     (pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
+                         row["last_name"])))
                     |
-                    ((pdp_contacts["first_name"] == row["last_name"]) &
-                    (pdp_contacts["last_name"] == row["first_name"]))
+                    ((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
+                        row[
+                            "last_name"])) &
+                     (pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
+                         row[
+                             "first_name"])))
                     &
-                    ((pdp_contacts["email"] == row["email"]) | (pdp_contacts["mobile"] == row["mobile"]))
+                    ((pdp_contacts["email"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
+                        row["email"])) | (
+                             pdp_contacts["mobile"] == row["mobile"]))
             )
-
         ]
         if row_matches.empty:  # new record, no matching rows
             max_matching_group += 1
@@ -74,4 +92,4 @@ def start(connection, added_or_updated_rows):
     items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
     current_app.logger.info("- Finished load to pdp_contacts table")
 
-    log_db.log_exec_status(job_id,{'status': 'complete', 'at_row':  len(rows), 'of_rows':len(rows)})
+    log_db.log_exec_status(job_id, {'status': 'complete', 'at_row': len(rows), 'of_rows': len(rows)})