Skip to content

Commit b812e95

Browse files
committed
lower case before matching
add function for string cleaning in matching
1 parent fc750e2 commit b812e95

File tree

1 file changed

+28
-10
lines changed

1 file changed

+28
-10
lines changed

src/server/pipeline/match_data.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,14 @@
66
from pipeline import log_db
77

88

9+
def normalize_before_match(value):
    """Return a lower-cased copy of ``value`` for case-insensitive matching.

    Only ``str`` input is normalized; any other input (``None``, numbers,
    NaN, ...) yields ``None``.
    """
    return value.lower() if isinstance(value, str) else None
16+
917

1018
def start(connection, added_or_updated_rows):
1119
# Match new records to each other and existing pdp_contacts data.
@@ -16,7 +24,7 @@ def start(connection, added_or_updated_rows):
1624
# try to match, and merge previous matching groups if applicable
1725

1826
job_id = str(int(time.time()))
19-
log_db.log_exec_status(job_id,{'status': 'starting', 'at_row': 0, 'of_rows':0})
27+
log_db.log_exec_status(job_id, {'status': 'starting', 'at_row': 0, 'of_rows': 0})
2028
current_app.logger.info("***** Running execute job ID " + job_id + " *****")
2129
items_to_update = pd.concat([added_or_updated_rows["new"], added_or_updated_rows["updated"]], ignore_index=True)
2230
pdp_contacts = pd.read_sql_table('pdp_contacts', connection)
@@ -36,22 +44,32 @@ def start(connection, added_or_updated_rows):
3644
for row_num, row in enumerate(rows):
3745
if row_num % row_print_freq == 0:
3846
current_app.logger.info("- Matching rows {}-{} of {}".format(
39-
row_num+1, min(len(rows), row_num+row_print_freq), len(rows))
47+
row_num + 1, min(len(rows), row_num + row_print_freq), len(rows))
4048
)
41-
log_db.log_exec_status(job_id,{'status': 'executing', 'at_row': row_num+1, 'of_rows':len(rows)})
49+
log_db.log_exec_status(job_id, {
50+
'status': 'executing', 'at_row': row_num + 1, 'of_rows': len(rows)
51+
})
4252

4353
# Exact matches based on specified columns
54+
4455
row_matches = pdp_contacts[
4556
(
46-
((pdp_contacts["first_name"] == row["first_name"]) &
47-
(pdp_contacts["last_name"] == row["last_name"]))
57+
((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
58+
row["first_name"])) &
59+
(pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
60+
row["last_name"])))
4861
|
49-
((pdp_contacts["first_name"] == row["last_name"]) &
50-
(pdp_contacts["last_name"] == row["first_name"]))
62+
((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
63+
row[
64+
"last_name"])) &
65+
(pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
66+
row[
67+
"first_name"])))
5168
&
52-
((pdp_contacts["email"] == row["email"]) | (pdp_contacts["mobile"] == row["mobile"]))
69+
((pdp_contacts["email"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
70+
row["email"])) | (
71+
pdp_contacts["mobile"] == row["mobile"]))
5372
)
54-
5573
]
5674
if row_matches.empty: # new record, no matching rows
5775
max_matching_group += 1
@@ -74,4 +92,4 @@ def start(connection, added_or_updated_rows):
7492
items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
7593
current_app.logger.info("- Finished load to pdp_contacts table")
7694

77-
log_db.log_exec_status(job_id,{'status': 'complete', 'at_row': len(rows), 'of_rows':len(rows)})
95+
log_db.log_exec_status(job_id, {'status': 'complete', 'at_row': len(rows), 'of_rows': len(rows)})

0 commit comments

Comments
 (0)