1313from datetime import datetime
1414
1515# local environment storage
16- repo_name = ' Ballot-Initiative'
16+ repo_name = " Ballot-Initiative"
1717REPODIR = os .getcwd ()
18- load_dotenv (os .path .join (REPODIR , ' .env' ), override = True )
18+ load_dotenv (os .path .join (REPODIR , " .env" ), override = True )
1919
2020# load config
21- with open (' config.json' , 'r' ) as f :
21+ with open (" config.json" , "r" ) as f :
2222 config = json .load (f )
2323
2424# Set up logging after imports
2727 os .makedirs (log_directory )
2828
2929# Create a logger
30- logger = logging .getLogger (' fuzzy_matching' )
30+ logger = logging .getLogger (" fuzzy_matching" )
3131logger .setLevel (logging .INFO )
3232
3333# Create handlers
3636console_handler = logging .StreamHandler ()
3737
3838# Create formatters and add it to handlers
39- log_format = logging .Formatter (' %(asctime)s - %(name)s - %(levelname)s - %(message)s' )
39+ log_format = logging .Formatter (" %(asctime)s - %(name)s - %(levelname)s - %(message)s" )
4040file_handler .setFormatter (log_format )
4141console_handler .setFormatter (log_format )
4242
4848## MATCHING FUNCTIONS
4949###
5050
def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
    """
    Creates a simplified DataFrame with full names and addresses from voter records.

    Args:
        voter_records (pd.DataFrame): DataFrame containing voter information with
            columns First_Name, Last_Name, Street_Number, Street_Name,
            Street_Type, and Street_Dir_Suffix.

    Returns:
        pd.DataFrame: DataFrame with only 'Full Name' and 'Full Address' columns.
    """
    # Work on a copy: the previous version mutated the caller's DataFrame in
    # place (fillna + two added columns), a surprising side effect.
    records = voter_records.copy()

    # Create full name by combining first and last names
    name_components = ["First_Name", "Last_Name"]
    records[name_components] = records[name_components].fillna("")
    records["Full Name"] = records[name_components].astype(str).agg(" ".join, axis=1)

    # Create full address by combining address components
    address_components = [
        "Street_Number",
        "Street_Name",
        "Street_Type",
        "Street_Dir_Suffix",
    ]
    records[address_components] = records[address_components].fillna("")
    records["Full Address"] = (
        records[address_components].astype(str).agg(" ".join, axis=1)
    )

    # Return only the columns we need
    return records[["Full Name", "Full Address"]]
7484
7585
def score_fuzzy_match_slim(
    ocr_result: str, comparison_list: List[str], scorer_=fuzz.ratio, limit_=10
) -> List[Tuple[str, int, int]]:
    """
    Scores the fuzzy match between the OCR result and the comparison list.

    Args:
        ocr_result (str): The OCR'd string to match.
        comparison_list (List[str]): The list of strings to compare against.
        scorer_ (function): The scorer function to use.
        limit_ (int): The maximum number of top matches to return.

    Returns:
        List[Tuple[str, int, int]]: (candidate, score, index) tuples for the
        top matches, best score first. At most ``limit_`` entries; fewer when
        the comparison list is shorter.
    """
    logger.debug(f"Starting fuzzy matching for: {ocr_result[:30]}...")

    # Convert to numpy array for faster operations
    comparison_array = np.array(comparison_list)
    if comparison_array.size == 0:
        # Nothing to match against; argpartition below would raise.
        return []

    # Vectorize the scorer function and score every candidate at once
    vectorized_scorer = np.vectorize(lambda x: scorer_(ocr_result, x))
    scores = vectorized_scorer(comparison_array)

    # Clamp the partition size: np.argpartition raises ValueError when asked
    # for more elements than exist, which previously crashed for comparison
    # lists shorter than limit_.
    k = min(limit_, comparison_array.size)
    top_indices = np.argpartition(scores, -k)[-k:]
    # Order the top-k by descending score
    top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]

    results = [(comparison_array[i], scores[i], i) for i in top_indices]
    logger.debug(f"Top match score: {results[0][1]}, Match: {results[0][0][:30]}...")
    return results
110119
def get_matched_name_address(
    ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
) -> List[Tuple[str, str, float, int]]:
    """
    Optimized name and address matching.

    Shortlists voter records by fuzzy name score, scores addresses only within
    that shortlist, and ranks candidates by the harmonic mean of the two
    scores.

    Args:
        ocr_name (str): The OCR result for the name.
        ocr_address (str): The OCR result for the address.
        select_voter_records (pd.DataFrame): DataFrame with "Full Name" and
            "Full Address" columns (see create_select_voter_records).

    Returns:
        List[Tuple[str, str, float, int]]: (matched name, matched address,
        combined score, voter-record index) tuples, best combined score first.
    """
    logger.debug(f"Matching - Name: {ocr_name[:30]}... Address: {ocr_address[:30]}...")

    # Get name matches (shortlist)
    name_matches = score_fuzzy_match_slim(
        ocr_name, select_voter_records["Full Name"].values
    )
    logger.debug(f"Best name match score: {name_matches[0][1]}")

    # Score addresses of the shortlisted records only.
    # relevant_addresses is aligned with name_matches order.
    matched_indices = [x[2] for x in name_matches]
    relevant_addresses = select_voter_records["Full Address"].values[matched_indices]
    address_matches = score_fuzzy_match_slim(ocr_address, relevant_addresses)
    logger.debug(f"Best address match score: {address_matches[0][1]}")

    # Realign address scores with the name-match ordering. address_matches is
    # sorted by address score and carries each entry's position within
    # relevant_addresses; the previous code zipped the two independently
    # sorted lists, pairing the i-th best name with the i-th best address even
    # when they belonged to different voters.
    addr_score_by_pos = {pos: score for _addr, score, pos in address_matches}
    name_scores = np.array([m[1] for m in name_matches], dtype=float)
    addr_scores = np.array(
        [addr_score_by_pos.get(i, 0.0) for i in range(len(name_matches))],
        dtype=float,
    )

    # Harmonic mean of the two scores; guard the 0/0 case (both scores zero),
    # which previously produced NaN and poisoned the sort order.
    denom = name_scores + addr_scores
    harmonic_means = np.where(
        denom > 0,
        2 * name_scores * addr_scores / np.where(denom > 0, denom, 1.0),
        0.0,
    )

    # Each result now pairs a record's own name and address.
    results = sorted(
        zip(
            [m[0] for m in name_matches],
            list(relevant_addresses),
            harmonic_means,
            matched_indices,
        ),
        key=lambda r: r[2],
        reverse=True,
    )

    logger.debug(f"Best combined match score: {results[0][2]}")
    return results
153167
def create_ocr_matched_df(
    ocr_df: pd.DataFrame,
    select_voter_records: pd.DataFrame,
    threshold: float = config["BASE_THRESHOLD"],
    st_bar=None,
) -> pd.DataFrame:
    """
    Creates a DataFrame with matched name and address for every OCR'd row.

    Args:
        ocr_df (pd.DataFrame): DataFrame with "OCR Name" and "OCR Address"
            columns plus the metadata columns listed in column_order below.
        select_voter_records (pd.DataFrame): DataFrame with "Full Name" and
            "Full Address" columns (see create_select_voter_records).
        threshold (float): Minimum combined match score for a row to be Valid.
            NOTE(review): the default is read from ``config`` once at import
            time; later config changes are not picked up.
        st_bar: Optional Streamlit progress bar to update while processing.

    Returns:
        pd.DataFrame: ocr_df plus "Matched Name", "Matched Address",
        "Match Score", and boolean "Valid" columns, reordered per column_order.
    """
    total_rows = len(ocr_df)
    logger.info(
        f"Starting matching process for {total_rows} records with threshold {threshold}"
    )

    # Process in batches for better memory management
    batch_size = 1000
    # Ceiling division: the old `len // batch_size + 1` over-counted the total
    # number of batches whenever the row count divided evenly.
    total_batches = max(1, (total_rows + batch_size - 1) // batch_size)
    results = []

    for batch_start in tqdm(range(0, total_rows, batch_size)):
        batch = ocr_df.iloc[batch_start : batch_start + batch_size]
        batch_num = batch_start // batch_size + 1
        logger.info(
            f"Processing batch {batch_num}, rows {batch_start} to {min(batch_start + batch_size, total_rows)}"
        )

        # Each row's matching is independent, so fan it out across threads.
        with ThreadPoolExecutor() as executor:
            batch_results = list(
                executor.map(
                    lambda row: get_matched_name_address(
                        row["OCR Name"], row["OCR Address"], select_voter_records
                    ),
                    [row for _, row in batch.iterrows()],
                )
            )

        # Keep only the best (first-ranked) candidate for each row
        batch_matches = [(res[0][0], res[0][1], res[0][2]) for res in batch_results]
        results.extend(batch_matches)

        # Log batch statistics
        batch_scores = [match[2] for match in batch_matches]
        logger.info(
            f"Batch statistics - Avg score: {np.mean(batch_scores):.2f}, "
            f"Min score: {min(batch_scores):.2f}, "
            f"Max score: {max(batch_scores):.2f}, "
            f"Valid matches: {sum(score >= threshold for score in batch_scores)}"
        )

        if st_bar:
            # Report the fraction of rows completed AFTER this batch so the
            # bar reaches 1.0 on the final batch (the old batch_start/len
            # started at 0 and never reached completion), and show batch
            # numbers rather than raw row offsets in the label.
            st_bar.progress(
                min(1.0, (batch_start + batch_size) / total_rows),
                text=f"Processed batch {batch_num} of {total_batches} batches",
            )

    logger.info("Creating final DataFrame")
    match_df = pd.DataFrame(
        results, columns=["Matched Name", "Matched Address", "Match Score"]
    )
    result_df = pd.concat([ocr_df, match_df], axis=1)
    result_df["Valid"] = result_df["Match Score"] >= threshold

    # Reorder columns
    column_order = [
        "OCR Name",
        "OCR Address",
        "Matched Name",
        "Matched Address",
        "Date",
        "Match Score",
        "Valid",
        "Page Number",
        "Row Number",
        "Filename",
    ]

    # Log final statistics; guard the percentage against empty input, which
    # previously raised ZeroDivisionError.
    total_valid = result_df["Valid"].sum()
    pct_valid = total_valid / len(result_df) * 100 if len(result_df) else 0.0
    logger.info(
        f"Matching complete - Total records: {len(result_df)}, "
        f"Valid matches: {total_valid} ({pct_valid:.1f}%)"
    )

    return result_df[column_order]
0 commit comments