Skip to content

Commit 30a5106

Browse files
committed
- fix last-updated file time so it is shown in local time
- improve matching efficiency by standardizing column strings to lowercase
1 parent b812e95 commit 30a5106

File tree

4 files changed

+19
-26
lines changed

4 files changed

+19
-26
lines changed

src/client/package-lock.json

Lines changed: 0 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/client/src/pages/Admin.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ class Admin extends Component {
146146
const fileName = file.split("-")[0];
147147
let fileDate = file.split("-").slice(1).join().split(".")[0];
148148
let fileDateOnlyNumbers = fileDate.replaceAll(",", "");
149-
let fileDateFormatted = moment(fileDateOnlyNumbers, "YYYYMMDDhmmss").local().format("MMMM Do YYYY, h:mm:ss a");
149+
let fileDateFormatted = moment.utc(fileDateOnlyNumbers, "YYYYMMDDhmmss").local().format("MMMM Do YYYY, h:mm:ss a");
150150

151151
return (
152152
<TableRow key={index}>

src/client/src/pages/DataView360/DataView360.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ class DataView360 extends Component {
202202
<Paper className={styles.main} elevation={1} style={{"padding": "1em"}}>
203203
<ContactInfo participant={_.get(this.state, 'participantData.contact_details')}/>
204204
<Grid container direction="row" justify="center">
205-
<Grid item style={{"marginTop": "1em", "position": "fixed"}}>
205+
<Grid item style={{"marginTop": "1em"}}>
206206
<Button variant="contained" color="primary"
207207
onClick={() => {
208208
this.setState({

src/server/pipeline/match_data.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,18 @@ def start(connection, added_or_updated_rows):
3939
items_to_update["archived_date"] = np.nan
4040
items_to_update["created_date"] = datetime.datetime.now()
4141

42+
# Create Normalized columns for matching
43+
items_to_update["first_name_normalized"] = items_to_update["first_name"].apply(normalize_before_match)
44+
items_to_update["last_name_normalized"] = items_to_update["last_name"].apply(normalize_before_match)
45+
items_to_update["email_normalized"] = items_to_update["email"].apply(normalize_before_match)
46+
47+
pdp_contacts["first_name_normalized"] = pdp_contacts["first_name"].apply(normalize_before_match)
48+
pdp_contacts["last_name_normalized"] = pdp_contacts["last_name"].apply(normalize_before_match)
49+
pdp_contacts["email_normalized"] = pdp_contacts["email"].apply(normalize_before_match)
50+
4251
rows = items_to_update.to_dict(orient="records")
4352
row_print_freq = max(1, np.floor_divide(len(rows), 20)) # approx every 5% (or every row if small)
53+
4454
for row_num, row in enumerate(rows):
4555
if row_num % row_print_freq == 0:
4656
current_app.logger.info("- Matching rows {}-{} of {}".format(
@@ -51,24 +61,15 @@ def start(connection, added_or_updated_rows):
5161
})
5262

5363
# Exact matches based on specified columns
54-
5564
row_matches = pdp_contacts[
5665
(
57-
((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
58-
row["first_name"])) &
59-
(pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
60-
row["last_name"])))
66+
((pdp_contacts["first_name_normalized"] == row["first_name_normalized"]) &
67+
(pdp_contacts["last_name_normalized"] == row["last_name_normalized"]))
6168
|
62-
((pdp_contacts["first_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
63-
row[
64-
"last_name"])) &
65-
(pdp_contacts["last_name"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
66-
row[
67-
"first_name"])))
69+
((pdp_contacts["first_name_normalized"] == row["last_name_normalized"]) &
70+
(pdp_contacts["last_name_normalized"] == row["first_name_normalized"]))
6871
&
69-
((pdp_contacts["email"].apply(lambda x: normalize_before_match(x)) == normalize_before_match(
70-
row["email"])) | (
71-
pdp_contacts["mobile"] == row["mobile"]))
72+
((pdp_contacts["email_normalized"] == row["email_normalized"]) | (pdp_contacts["mobile"] == row["mobile"]))
7273
)
7374
]
7475
if row_matches.empty: # new record, no matching rows
@@ -89,6 +90,8 @@ def start(connection, added_or_updated_rows):
8990

9091
# Write new data and matching ID's to postgres in bulk, instead of line-by-line
9192
current_app.logger.info("- Writing data to pdp_contacts table")
93+
items_to_update = items_to_update.drop(
94+
columns=["first_name_normalized", "last_name_normalized", "email_normalized"])
9295
items_to_update.to_sql('pdp_contacts', connection, index=False, if_exists='append')
9396
current_app.logger.info("- Finished load to pdp_contacts table")
9497

0 commit comments

Comments
 (0)