Commit 4681add

Merge pull request #275 from CodeForPhilly/manual_matches
Manual matches
2 parents: 70b5530 + 2a35e79

4 files changed: 29 additions, 14 deletions

src/server/datasource_manager.py
Lines changed: 2 additions & 1 deletion

```diff
@@ -21,7 +21,8 @@ def __clean_csv_headers(header):
         'Account ID (18 digit)',
         'Opportunity Name', 'Stage', 'Fiscal Period', 'Amount', 'Probability (%)', 'Age',
         'Close Date', 'Created Date', 'Type', 'Primary Campaign Source',
-        'Source', 'Contact ID (18 Digit)', 'Primary Contact']
+        'Source', 'Contact ID (18 Digit)', 'Primary Contact'],
+    'manualmatches': ['salesforcecontacts', 'volgistics', 'shelterluvpeople']
 }
 
 DATASOURCE_MAPPING = {
```
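
For context, a minimal sketch of the kind of CSV the new 'manualmatches' entry implies: one column per source system, named exactly as in the list above, with each row linking one person's IDs across systems. The ID values here are invented for illustration; blank cells (no record in that system) are what the fillna handling in clean_and_load_data.py below deals with.

```python
import io
import pandas as pd

# Hypothetical manualmatches upload: each row links one person's IDs
# across the three source systems; a blank cell means no ID there.
csv_text = """salesforcecontacts,volgistics,shelterluvpeople
0031a000001AbCdEF,12345,
0031a000001GhIjKL,,67890
"""

manual_matches_df = pd.read_csv(io.StringIO(csv_text))
print(manual_matches_df)
```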

src/server/pipeline/clean_and_load_data.py
Lines changed: 8 additions & 2 deletions

```diff
@@ -13,10 +13,16 @@
 def start(connection, pdp_contacts_df, file_path_list):
     result = pd.DataFrame(columns=pdp_contacts_df.columns)
     json_rows = pd.DataFrame(columns=["source_type", "source_id", "json"])
-
+    manual_matches_df = None
+
     for uploaded_file in file_path_list:
         file_path = os.path.join(CURRENT_SOURCE_FILES_PATH, uploaded_file)
         table_name = file_path.split('/')[-1].split('-')[0]
+        if table_name == 'manualmatches':
+            manual_matches_df = pd.read_csv((io.BytesIO(open(file_path, "rb").read())), encoding='iso-8859-1')
+            manual_matches_df[["volgistics", "shelterluvpeople"]] = manual_matches_df[["volgistics", "shelterluvpeople"]].fillna(0).astype(int).astype(str)
+            continue
+
         current_app.logger.info('Running load_paws_data on: ' + uploaded_file)
 
         df = pd.read_csv((io.BytesIO(open(file_path, "rb").read())), encoding='iso-8859-1')
@@ -54,7 +60,7 @@ def start(connection, pdp_contacts_df, file_path_list):
 
     current_app.logger.info(' - Finish load_paws_data on: ' + uploaded_file)
 
-    return result, json_rows
+    return result, json_rows, manual_matches_df
 
 
 def create_normalized_df(df, normalized_df, table_name):
```
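
The `.fillna(0).astype(int).astype(str)` chain deserves a note: when a CSV column contains blank cells, pandas parses the surviving numeric IDs as floats, so converting straight to strings would yield values like '12345.0' that could never equal the string IDs stored in pdp_contacts. A self-contained illustration of the problem and the round-trip fix:

```python
import io
import pandas as pd

csv_text = """salesforcecontacts,volgistics,shelterluvpeople
a1,12345,
a2,,67890
"""
df = pd.read_csv(io.StringIO(csv_text))

# Blank cells force the ID columns to float dtype, so a naive string
# conversion yields values that can never match: ['12345.0', 'nan'].
print(df["volgistics"].astype(str).tolist())

# The pipeline's round-trip through int produces clean ID strings,
# with missing IDs becoming the sentinel string '0': ['12345', '0'].
cols = ["volgistics", "shelterluvpeople"]
df[cols] = df[cols].fillna(0).astype(int).astype(str)
print(df["volgistics"].tolist())
```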

src/server/pipeline/flow_script.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -25,7 +25,7 @@ def start_flow():
     # Populate new records in secondary tables (donations, volunteer shifts)
     # input - existing files in path
     # output - normalized object of all entries, as well as the input json rows for primary sources
-    normalized_data, source_json = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
 
     # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
     # (If additional inconsistencies are encountered, may need to enforce the schema of
@@ -41,7 +41,7 @@ def start_flow():
 
     # Match new+updated records against previous version of pdp_contacts database, and
     # write these rows to the database.
-    match_data.start(connection, rows_classified)
+    match_data.start(connection, rows_classified, manual_matches_df)
 
     # Copy raw input rows to json fields in pdp_contacts,
     # using a temporary table to simplify the update code.
```

src/server/pipeline/match_data.py
Lines changed: 17 additions & 9 deletions

```diff
@@ -15,14 +15,13 @@ def normalize_before_match(value):
     return result
 
 
-def start(connection, added_or_updated_rows):
+def start(connection, added_or_updated_rows, manual_matches_df):
     # Match new records to each other and existing pdp_contacts data.
     # Assigns matching ID's to records, as well.
     # WARNING: not thread-safe and could lead to concurrency issues if two users /execute simultaneously
     current_app.logger.info('Start record matching')
     # Will need to consider updating the existing row contents (filter by active), deactivate,
     # try to match, and merge previous matching groups if applicable
-
     job_id = str(int(time.time()))
     log_db.log_exec_status(job_id, {'status': 'starting', 'at_row': 0, 'of_rows': 0})
     current_app.logger.info("***** Running execute job ID " + job_id + " *****")
@@ -63,15 +62,24 @@ def start(connection, added_or_updated_rows):
         # Exact matches based on specified columns
         row_matches = pdp_contacts[
             (
-                ((pdp_contacts["first_name_normalized"] == row["first_name_normalized"]) &
-                (pdp_contacts["last_name_normalized"] == row["last_name_normalized"]))
-                |
-                ((pdp_contacts["first_name_normalized"] == row["last_name_normalized"]) &
-                (pdp_contacts["last_name_normalized"] == row["first_name_normalized"]))
-                &
-                ((pdp_contacts["email_normalized"] == row["email_normalized"]) | (pdp_contacts["mobile"] == row["mobile"]))
+                ((pdp_contacts["first_name_normalized"] == row["first_name_normalized"]) &
+                 (pdp_contacts["last_name_normalized"] == row["last_name_normalized"]))
+                |
+                ((pdp_contacts["first_name_normalized"] == row["last_name_normalized"]) &
+                 (pdp_contacts["last_name_normalized"] == row["first_name_normalized"]))
+                &
+                ((pdp_contacts["email_normalized"] == row["email_normalized"]) | (pdp_contacts["mobile"] == row["mobile"]))
             )
         ]
+        # collect other linked ids from manual matches source
+        if manual_matches_df is not None:
+            linked_ids = manual_matches_df[(manual_matches_df[row["source_type"]] == row["source_id"])]
+            ids = linked_ids.to_dict(orient="records")
+            for row_dict in ids:
+                for column, value in row_dict.items():
+                    row_matches = row_matches.append(pdp_contacts[(pdp_contacts["source_type"] == column) & (pdp_contacts["source_id"] == value)])
+
+
         if row_matches.empty:  # new record, no matching rows
             max_matching_group += 1
             row_group = max_matching_group
```
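
To see what the new manual-match branch does in isolation, here is a small self-contained sketch of the same lookup: take the row being matched, find the manual-match rows that mention its ID under its own source column, and gather every pdp_contacts row whose (source_type, source_id) pair appears in those rows. All frames and IDs are invented for illustration, and pd.concat stands in for DataFrame.append, which newer pandas versions have removed:

```python
import pandas as pd

# Hypothetical stand-ins for the pipeline's real data.
pdp_contacts = pd.DataFrame({
    "source_type": ["salesforcecontacts", "volgistics", "shelterluvpeople"],
    "source_id":   ["a1", "12345", "67890"],
})
manual_matches_df = pd.DataFrame({
    "salesforcecontacts": ["a1"],
    "volgistics": ["12345"],
    "shelterluvpeople": ["67890"],
})

# The record currently being matched, as in the loop above.
row = {"source_type": "salesforcecontacts", "source_id": "a1"}

# Manual-match rows that mention this record's ID.
linked_ids = manual_matches_df[manual_matches_df[row["source_type"]] == row["source_id"]]

# Every pdp_contacts row whose (source_type, source_id) pair
# appears in one of the linked manual-match rows.
matches = [
    pdp_contacts[(pdp_contacts["source_type"] == column) &
                 (pdp_contacts["source_id"] == value)]
    for row_dict in linked_ids.to_dict(orient="records")
    for column, value in row_dict.items()
]
row_matches = pd.concat(matches) if matches else pdp_contacts.iloc[0:0]
print(row_matches)
```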
