
Commit 19a752a

run ruff check and format
1 parent d935bfe commit 19a752a

8 files changed: 332 additions and 221 deletions

app/Home.py

Lines changed: 7 additions & 3 deletions
@@ -20,7 +20,11 @@
     """
 )

-st.image("app/ballot_initiative_schematic.png", caption="Core process for validating signatures", use_container_width=True)
+st.image(
+    "app/ballot_initiative_schematic.png",
+    caption="Core process for validating signatures",
+    use_container_width=True,
+)

 st.markdown(
     """
@@ -47,6 +51,6 @@
     "© 2025 Ballot Initiative Project | "
     # "<a href='#'>Privacy Policy</a> | "
     "<a href='https://github.com/Civic-Tech-Ballot-Inititiave/Ballot-Initiative/blob/main/LICENSE.md'>Terms of Use</a>"
-    "</div>",
-    unsafe_allow_html=True
+    "</div>",
+    unsafe_allow_html=True,
 )

app/api.py

Lines changed: 11 additions & 17 deletions
@@ -1,14 +1,11 @@
 import os
-import time

-from fastapi import FastAPI, Request, Response
+from fastapi import FastAPI, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fuzzy_match_helper import create_ocr_matched_df, create_select_voter_records
-from ocr_helper import create_ocr_df, log_filename
+from ocr_helper import create_ocr_df
 from routers import file
 from settings.settings_repo import config
-from sh import tail
-from sse_starlette.sse import EventSourceResponse
 from utils import logger

 app = FastAPI(root_path="/api")
@@ -29,12 +26,13 @@

 app.include_router(file.router)

+
 @app.post("/ocr", tags=["OCR"])
 def ocr(response: Response):
     """
     Triggers the OCR process on the uploaded petition signatures PDF file.
     """
-    if not os.path.exists('temp/ballot.pdf'):
+    if not os.path.exists("temp/ballot.pdf"):
         logger.error("No PDF file found for petition signatures")
         response.status_code = 400
         return {"error": "No PDF file found for petition signatures"}
@@ -45,21 +43,17 @@ def ocr(response: Response):
     logger.info("Starting OCR processing...")
     # Process files if in processing state
     logger.info("Converting PDF to images...")
-
-    ocr_df = create_ocr_df(filedir='temp',
-                           filename='ballot.pdf')
-
+
+    ocr_df = create_ocr_df(filedir="temp", filename="ballot.pdf")
+
     logger.info("Compiling Voter Record Data...")

     select_voter_records = create_select_voter_records(app.state.voter_records_df)
-
+
     logger.info("Matching petition signatures to voter records...")

     ocr_matched_df = create_ocr_matched_df(
-        ocr_df,
-        select_voter_records,
-        threshold=config['BASE_THRESHOLD']
+        ocr_df, select_voter_records, threshold=config["BASE_THRESHOLD"]
     )
-    response.headers['Content-Type'] = 'application/json'
-    return {'data': ocr_matched_df.to_dict(orient='records'), 'stats': {}}
-
+    response.headers["Content-Type"] = "application/json"
+    return {"data": ocr_matched_df.to_dict(orient="records"), "stats": {}}

app/fuzzy_match_helper.py

Lines changed: 113 additions & 75 deletions
@@ -13,12 +13,12 @@
 from datetime import datetime

 # local environment storage
-repo_name = 'Ballot-Initiative'
+repo_name = "Ballot-Initiative"
 REPODIR = os.getcwd()
-load_dotenv(os.path.join(REPODIR, '.env'), override=True)
+load_dotenv(os.path.join(REPODIR, ".env"), override=True)

 # load config
-with open('config.json', 'r') as f:
+with open("config.json", "r") as f:
     config = json.load(f)

 # Set up logging after imports
@@ -27,7 +27,7 @@
     os.makedirs(log_directory)

 # Create a logger
-logger = logging.getLogger('fuzzy_matching')
+logger = logging.getLogger("fuzzy_matching")
 logger.setLevel(logging.INFO)

 # Create handlers
@@ -36,7 +36,7 @@
 console_handler = logging.StreamHandler()

 # Create formatters and add it to handlers
-log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+log_format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 file_handler.setFormatter(log_format)
 console_handler.setFormatter(log_format)

@@ -48,35 +48,44 @@
 ## MATCHING FUNCTIONS
 ###

-def create_select_voter_records(voter_records : pd.DataFrame) -> pd.DataFrame:
+
+def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a simplified DataFrame with full names and addresses from voter records.
-
+
     Args:
         voter_records (pd.DataFrame): DataFrame containing voter information with columns for
            first name, last name, and address components.
-
+
     Returns:
         pd.DataFrame: DataFrame with 'Full Name' and 'Full Address' columns
     """
     # Create full name by combining first and last names
     name_components = ["First_Name", "Last_Name"]
-    voter_records[name_components] = voter_records[name_components].fillna('')
-    voter_records["Full Name"] = voter_records[name_components].astype(str).agg(" ".join, axis=1)
+    voter_records[name_components] = voter_records[name_components].fillna("")
+    voter_records["Full Name"] = (
+        voter_records[name_components].astype(str).agg(" ".join, axis=1)
+    )

     # Create full address by combining address components
-    address_components = ["Street_Number", "Street_Name", "Street_Type", "Street_Dir_Suffix"]
-    voter_records[address_components] = voter_records[address_components].fillna('')
-    voter_records["Full Address"] = voter_records[address_components].astype(str).agg(" ".join, axis=1)
+    address_components = [
+        "Street_Number",
+        "Street_Name",
+        "Street_Type",
+        "Street_Dir_Suffix",
+    ]
+    voter_records[address_components] = voter_records[address_components].fillna("")
+    voter_records["Full Address"] = (
+        voter_records[address_components].astype(str).agg(" ".join, axis=1)
+    )

     # Return only the columns we need
     return voter_records[["Full Name", "Full Address"]]


-def score_fuzzy_match_slim(ocr_result : str,
-                           comparison_list : List[str],
-                           scorer_=fuzz.ratio,
-                           limit_=10) -> List[Tuple[str, int, int]]:
+def score_fuzzy_match_slim(
+    ocr_result: str, comparison_list: List[str], scorer_=fuzz.ratio, limit_=10
+) -> List[Tuple[str, int, int]]:
     """
     Scores the fuzzy match between the OCR result and the comparison list.

@@ -85,76 +94,84 @@ def score_fuzzy_match_slim(ocr_result : str,
         comparison_list (List[str]): The list of strings to compare against.
         scorer_ (function): The scorer function to use.
         limit_ (int): The number of top matches to return.
-
+
     Returns:
         List[Tuple[str, int, int]]: The list of top matches with their scores and indices.
     """
     logger.debug(f"Starting fuzzy matching for: {ocr_result[:30]}...")
-
+
     # Convert to numpy array for faster operations
     comparison_array = np.array(comparison_list)
-
+
     # Vectorize the scorer function
     vectorized_scorer = np.vectorize(lambda x: scorer_(ocr_result, x))
-
+
     # Calculate all scores at once
     scores = vectorized_scorer(comparison_array)
-
+
     # Get top N indices
     top_indices = np.argpartition(scores, -limit_)[-limit_:]
     top_indices = top_indices[np.argsort(scores[top_indices])[::-1]]
-
+
     results = [(comparison_array[i], scores[i], i) for i in top_indices]
     logger.debug(f"Top match score: {results[0][1]}, Match: {results[0][0][:30]}...")
     return results

-def get_matched_name_address(ocr_name : str,
-                             ocr_address : str,
-                             select_voter_records : pd.DataFrame) -> List[Tuple[str, str, float, int]]:
+
+def get_matched_name_address(
+    ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
+) -> List[Tuple[str, str, float, int]]:
     """
     Optimized name and address matching

     Args:
         ocr_name (str): The OCR result for the name.
         ocr_address (str): The OCR result for the address.
         select_voter_records (pd.DataFrame): The DataFrame containing voter records.
-
+
     Returns:
         List[Tuple[str, str, float, int]]: The list of top matches with their scores and indices.
     """
     logger.debug(f"Matching - Name: {ocr_name[:30]}... Address: {ocr_address[:30]}...")
-
+
     # Get name matches
-    name_matches = score_fuzzy_match_slim(ocr_name, select_voter_records["Full Name"].values)
+    name_matches = score_fuzzy_match_slim(
+        ocr_name, select_voter_records["Full Name"].values
+    )
     logger.debug(f"Best name match score: {name_matches[0][1]}")
-
+
     # Get address matches
     matched_indices = [x[2] for x in name_matches]
     relevant_addresses = select_voter_records["Full Address"].values[matched_indices]
     address_matches = score_fuzzy_match_slim(ocr_address, relevant_addresses)
     logger.debug(f"Best address match score: {address_matches[0][1]}")
-
+
     # Calculate harmonic means
     name_scores = np.array([x[1] for x in name_matches])
     addr_scores = np.array([x[1] for x in address_matches])
     harmonic_means = 2 * name_scores * addr_scores / (name_scores + addr_scores)
-
+
     # Create and sort results
-    results = list(zip(
-        [x[0] for x in name_matches],
-        [x[0] for x in address_matches],
-        harmonic_means,
-        matched_indices
-    ))
+    results = list(
+        zip(
+            [x[0] for x in name_matches],
+            [x[0] for x in address_matches],
+            harmonic_means,
+            matched_indices,
+        )
+    )
     results = sorted(results, key=lambda x: x[2], reverse=True)
-
+
     logger.debug(f"Best combined match score: {results[0][2]}")
     return results

-def create_ocr_matched_df(ocr_df : pd.DataFrame,
-                          select_voter_records : pd.DataFrame,
-                          threshold : float = config['BASE_THRESHOLD'],
-                          st_bar = None) -> pd.DataFrame:
+
+def create_ocr_matched_df(
+    ocr_df: pd.DataFrame,
+    select_voter_records: pd.DataFrame,
+    threshold: float = config["BASE_THRESHOLD"],
+    st_bar=None,
+) -> pd.DataFrame:
     """
     Creates a DataFrame with matched name and address.

@@ -163,59 +180,80 @@ def create_ocr_matched_df(ocr_df : pd.DataFrame,
         select_voter_records (pd.DataFrame): The DataFrame containing voter records.
         threshold (float): The threshold for matching.
         st_bar (st.progress): The progress bar to display.
-
+
     Returns:
         pd.DataFrame: The DataFrame with matched name and address.
     """
-    logger.info(f"Starting matching process for {len(ocr_df)} records with threshold {threshold}")
-
+    logger.info(
+        f"Starting matching process for {len(ocr_df)} records with threshold {threshold}"
+    )
+
     # Process in batches for better memory management
     batch_size = 1000
     results = []
-
+
     for batch_start in tqdm(range(0, len(ocr_df), batch_size)):
-        batch = ocr_df.iloc[batch_start:batch_start + batch_size]
-        logger.info(f"Processing batch {batch_start//batch_size + 1}, rows {batch_start} to {min(batch_start + batch_size, len(ocr_df))}")
-
+        batch = ocr_df.iloc[batch_start : batch_start + batch_size]
+        logger.info(
+            f"Processing batch {batch_start // batch_size + 1}, rows {batch_start} to {min(batch_start + batch_size, len(ocr_df))}"
+        )
+
         # Process batch in parallel
         with ThreadPoolExecutor() as executor:
-            batch_results = list(executor.map(
-                lambda row: get_matched_name_address(
-                    row["OCR Name"],
-                    row["OCR Address"],
-                    select_voter_records
-                ),
-                [row for _, row in batch.iterrows()]
-            ))
-
+            batch_results = list(
+                executor.map(
+                    lambda row: get_matched_name_address(
+                        row["OCR Name"], row["OCR Address"], select_voter_records
+                    ),
+                    [row for _, row in batch.iterrows()],
+                )
+            )
+
         # Extract best matches
         batch_matches = [(res[0][0], res[0][1], res[0][2]) for res in batch_results]
         results.extend(batch_matches)
-
+
         # Log batch statistics
         batch_scores = [match[2] for match in batch_matches]
-        logger.info(f"Batch statistics - Avg score: {np.mean(batch_scores):.2f}, "
-                    f"Min score: {min(batch_scores):.2f}, "
-                    f"Max score: {max(batch_scores):.2f}, "
-                    f"Valid matches: {sum(score >= threshold for score in batch_scores)}")
+        logger.info(
+            f"Batch statistics - Avg score: {np.mean(batch_scores):.2f}, "
+            f"Min score: {min(batch_scores):.2f}, "
+            f"Max score: {max(batch_scores):.2f}, "
+            f"Valid matches: {sum(score >= threshold for score in batch_scores)}"
+        )

         if st_bar:
-            st_bar.progress(batch_start / len(ocr_df), text=f"Processing batch {batch_start} out of {len(ocr_df)//batch_size+1} batches")
-
+            st_bar.progress(
+                batch_start / len(ocr_df),
+                text=f"Processing batch {batch_start} out of {len(ocr_df) // batch_size + 1} batches",
+            )
+
     logger.info("Creating final DataFrame")
-    match_df = pd.DataFrame(results, columns=["Matched Name", "Matched Address", "Match Score"])
+    match_df = pd.DataFrame(
+        results, columns=["Matched Name", "Matched Address", "Match Score"]
+    )
     result_df = pd.concat([ocr_df, match_df], axis=1)
     result_df["Valid"] = result_df["Match Score"] >= threshold
-
+
     # Reorder columns
     column_order = [
-        "OCR Name", "OCR Address", "Matched Name", "Matched Address",
-        "Date", "Match Score", "Valid", "Page Number", "Row Number", "Filename"
+        "OCR Name",
+        "OCR Address",
+        "Matched Name",
+        "Matched Address",
+        "Date",
+        "Match Score",
+        "Valid",
+        "Page Number",
+        "Row Number",
+        "Filename",
     ]
-
+
     # Log final statistics
     total_valid = result_df["Valid"].sum()
-    logger.info(f"Matching complete - Total records: {len(result_df)}, "
-                f"Valid matches: {total_valid} ({total_valid/len(result_df)*100:.1f}%)")
-
-    return result_df[column_order]
+    logger.info(
+        f"Matching complete - Total records: {len(result_df)}, "
+        f"Valid matches: {total_valid} ({total_valid / len(result_df) * 100:.1f}%)"
+    )
+
+    return result_df[column_order]
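
Beyond the quote and line-wrapping changes, the reformatted code makes the matching pipeline easier to follow: score_fuzzy_match_slim keeps the top limit_ candidates via np.argpartition, and get_matched_name_address combines name and address scores with a harmonic mean, so a row only scores well when both fields match. A simplified, standalone sketch of that combination step, using made-up scores rather than real fuzz.ratio output:

# Illustrative only: fabricated scores and a simplified candidate pairing,
# not output from the project code.
import numpy as np

name_scores = np.array([96.0, 88.0, 72.0, 65.0, 40.0])  # fuzz.ratio-style, 0-100
addr_scores = np.array([91.0, 52.0, 80.0, 30.0, 95.0])

# Top-3 name candidates, highest first (argpartition is O(n); argsort orders the slice).
k = 3
top = np.argpartition(name_scores, -k)[-k:]
top = top[np.argsort(name_scores[top])[::-1]]

# Harmonic mean stays low unless BOTH scores are high.
combined = 2 * name_scores[top] * addr_scores[top] / (name_scores[top] + addr_scores[top])
best = top[np.argmax(combined)]
print(best, round(combined.max(), 1))  # -> 0 93.4

In the project code the address scores are computed only over the name-preselected rows, which keeps the per-row cost bounded by limit_ rather than by the size of the full voter file.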
