civictechdc · kvithayathil · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
@@ -2,12 +2,13 @@
 
 from fastapi import FastAPI, Response
 from fastapi.middleware.cors import CORSMiddleware
-from fuzzy_match_helper import create_ocr_matched_df, create_select_voter_records
-from ocr_helper import create_ocr_df
 from routers import file
 from settings.settings_repo import config
 from utils import logger
 
+from .matcher import create_ocr_matched_df
+from .ocr import create_ocr_df
+
 app = FastAPI(root_path="/api")
 app.state.voter_records_df = None
 
@@ -48,12 +49,10 @@ def ocr(response: Response):
 
     logger.info("Compiling Voter Record Data...")
 
-    select_voter_records = create_select_voter_records(app.state.voter_records_df)
-
     logger.info("Matching petition signatures to voter records...")
 
     ocr_matched_df = create_ocr_matched_df(
-        ocr_df, select_voter_records, threshold=config["BASE_THRESHOLD"]
+        ocr_df, app.state.voter_records_df, threshold=config["BASE_THRESHOLD"]
     )
     response.headers["Content-Type"] = "application/json"
     return {"data": ocr_matched_df.to_dict(orient="records"), "stats": {}}
@@ -0,0 +1,3 @@
+from .voter_matcher import create_ocr_matched_df
+
+__all__ = ["create_ocr_matched_df"]
@@ -1,21 +1,16 @@
 # needed libraries
 ### structured outputs; replacements
-import os
 import json
-from typing import List, Tuple
-from tqdm.notebook import tqdm
-from rapidfuzz import fuzz
-from dotenv import load_dotenv
-import pandas as pd
-import numpy as np
-from concurrent.futures import ThreadPoolExecutor
 import logging
+import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import datetime
 
-# local environment storage
-repo_name = "Ballot-Initiative"
-REPODIR = os.getcwd()
-load_dotenv(os.path.join(REPODIR, ".env"), override=True)
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+from rapidfuzz import fuzz
+from tqdm import tqdm
 
 # load config
 with open("config.json", "r") as f:
@@ -49,7 +44,7 @@
 ###
 
 
-def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
+def _create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
     """
     Creates a simplified DataFrame with full names and addresses from voter records.
 
@@ -84,19 +79,19 @@ def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
 
 
 def score_fuzzy_match_slim(
-    ocr_result: str, comparison_list: List[str], scorer_=fuzz.ratio, limit_=10
-) -> List[Tuple[str, int, int]]:
+    ocr_result: str, comparison_list: list[str], scorer_=fuzz.ratio, limit_=10
+) -> list[tuple[str, int, int]]:
     """
     Scores the fuzzy match between the OCR result and the comparison list.
 
     Args:
         ocr_result (str): The OCR result to match.
-        comparison_list (List[str]): The list of strings to compare against.
+        comparison_list (list[str]): The list of strings to compare against.
         scorer_ (function): The scorer function to use.
         limit_ (int): The number of top matches to return.
 
     Returns:
-        List[Tuple[str, int, int]]: The list of top matches with their scores and indices.
+        list[tuple[str, int, int]]: The list of top matches with their scores and indices.
     """
     logger.debug(f"Starting fuzzy matching for: {ocr_result[:30]}...")
 
@@ -118,9 +113,9 @@ def score_fuzzy_match_slim(
     return results
 
 
-def get_matched_name_address(
+def _get_matched_name_address(
     ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
-) -> List[Tuple[str, str, float, int]]:
+) -> list[tuple[str, str, float, int]]:
     """
     Optimized name and address matching
 
@@ -130,7 +125,7 @@ def get_matched_name_address(
         select_voter_records (pd.DataFrame): The DataFrame containing voter records.
 
     Returns:
-        List[Tuple[str, str, float, int]]: The list of top matches with their scores and indices.
+        list[tuple[str, str, float, int]]: The list of top matches with their scores and indices.
     """
     logger.debug(f"Matching - Name: {ocr_name[:30]}... Address: {ocr_address[:30]}...")
 
@@ -168,7 +163,7 @@ def get_matched_name_address(
 
 def create_ocr_matched_df(
     ocr_df: pd.DataFrame,
-    select_voter_records: pd.DataFrame,
+    voter_records: pd.DataFrame,
     threshold: float = config["BASE_THRESHOLD"],
     st_bar=None,
 ) -> pd.DataFrame:
@@ -188,6 +183,10 @@ def create_ocr_matched_df(
         f"Starting matching process for {len(ocr_df)} records with threshold {threshold}"
     )
 
+    select_voter_records: DataFrame = _create_select_voter_records(
+        voter_records=voter_records
+    )
+
     # Process in batches for better memory management
     batch_size = 1000
     results = []
@@ -199,10 +198,10 @@ def create_ocr_matched_df(
         )
 
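         # Process batch in parallel.
         # NOTE(review): ProcessPoolExecutor must pickle the callable it is given, and a
         # lambda cannot be pickled — pass a module-level function (e.g. wrapped with
         # functools.partial) or keep a thread pool here.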
-        with ThreadPoolExecutor() as executor:
+        with ProcessPoolExecutor() as executor:
             batch_results = list(
                 executor.map(
-                    lambda row: get_matched_name_address(
+                    lambda row: _get_matched_name_address(
                         row["OCR Name"], row["OCR Address"], select_voter_records
                     ),
                     [row for _, row in batch.iterrows()],

@@ -1,3 +1,3 @@
-from .ocr_client_factory import extract_from_encoding_async
+from .ocr_helper import create_ocr_df
 
-__all__ = ["extract_from_encoding_async"]
+__all__ = ["create_ocr_df"]
@@ -1,17 +1,15 @@
-from typing import List
+import asyncio
 import base64
-import os
 import json
-from tqdm.notebook import tqdm
-from dotenv import load_dotenv
-import pandas as pd
-import asyncio
-import fitz  # Add this import at the top with other imports
-
 import logging
+import os
 from datetime import datetime
 
-from ocr import extract_from_encoding_async
+import fitz  # PyMuPDF
+import pandas as pd
+from tqdm import tqdm
+
+from .ocr_client_factory import extract_from_encoding_async
 
 # Set up logging
 log_directory = "logs"
@@ -36,9 +34,6 @@
 logger.addHandler(file_handler)
 logger.addHandler(console_handler)
 
-repo_name = "Ballot-Initiative"
-REPODIR = os.getcwd()
-load_dotenv(os.path.join(REPODIR, ".env"), override=True)
 
 # OpenAI API key
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
@@ -50,7 +45,7 @@
     config = json.load(f)
 
 
-def collecting_pdf_encoded_images(file_path: str) -> List[str]:
+def _collecting_pdf_encoded_images(file_path: str) -> list[str]:
     """Convert PDF pages to encoded images, cropping to target area.
     Returns list of base64 encoded image strings."""
 
@@ -97,17 +92,17 @@ def collecting_pdf_encoded_images(file_path: str) -> List[str]:
 
 
 # Attach page, row, and filename metadata to recognized signatures
-def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
+def _add_metadata(initial_data: list[dict], page_no: int, filename: str) -> list[dict]:
     """
     Adds page number, row number, and filename metadata to the recognized signatures
 
     Args:
-        initial_data (List[dict]): The initial data to add metadata to.
+        initial_data (list[dict]): The initial data to add metadata to.
         page_no (int): The page number of the current page.
         filename (str): The name of the file.
 
     Returns:
-        List[dict]: The final data with metadata.
+        list[dict]: The final data with metadata.
     """
 
     final_data = list()
@@ -121,7 +116,7 @@ def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[
     return final_data
 
 
-async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
+async def _process_batch_async(encodings: list[str]) -> list[list[dict]]:
     """
     Process a batch of images concurrently
     """
@@ -132,7 +127,7 @@ async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
     return results
 
 
-def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
+def _get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     try:
         return asyncio.get_event_loop()
     except RuntimeError:
@@ -141,13 +136,13 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
         return loop
 
 
-def collect_ocr_data(
+def _collect_ocr_data(
     filedir: str,
     filename: str,
     max_page_num: int = None,
     batch_size: int = 10,
     st_bar=None,
-) -> List[dict]:
+) -> list[dict]:
     """
     Collects OCR data from a PDF file.
 
@@ -165,7 +160,7 @@ def collect_ocr_data(
     logger.info(f"Parameters - max_page_num: {max_page_num}, batch_size: {batch_size}")
 
     # collecting images
-    encoded_images = collecting_pdf_encoded_images(os.path.join(filedir, filename))
+    encoded_images = _collecting_pdf_encoded_images(os.path.join(filedir, filename))
 
     # selecting pages
     if max_page_num:
@@ -180,7 +175,7 @@ def collect_ocr_data(
     total_pages = len(encoded_images)
 
     # getting event loop
-    loop = get_or_create_event_loop()
+    loop = _get_or_create_event_loop()
 
     # Process in batches
     logger.info(f"Processing {total_pages} pages in batches of {batch_size}")
@@ -199,12 +194,12 @@ def collect_ocr_data(
             )
 
         # Run async batch processing using the event loop
-        batch_results = loop.run_until_complete(process_batch_async(batch))
+        batch_results = loop.run_until_complete(_process_batch_async(batch))
 
         # Add metadata for each result in the batch
         for page_idx, result in enumerate(batch_results):
             current_page = i + page_idx
-            ocr_data = add_metadata(result, current_page, filename)
+            ocr_data = _add_metadata(result, current_page, filename)
             full_data.extend(ocr_data)
 
         logger.info(
@@ -218,7 +213,7 @@ def collect_ocr_data(
 def create_ocr_df(
     filedir: str,
     filename: str,
-    max_page_num: int = None,
+    max_page_num: int | None = None,
     batch_size: int = 10,
     st_bar=None,
 ) -> pd.DataFrame:
@@ -238,7 +233,7 @@ def create_ocr_df(
     logger.info("Starting OCR DataFrame creation")
 
     # gathering ocr_data
-    ocr_data = collect_ocr_data(
+    ocr_data = _collect_ocr_data(
         filedir,
         filename,
         max_page_num=max_page_num,

@@ -1,9 +1,9 @@
 [project]
-name = "ballot-petition-signature-verifier"
+name = "vote-catcher-server"
 version = "0.0.15"
-description = "A python package for verifying ballot petition signatures"
+description = "A Python server for collecting and verifying votes and ballot petition signatures"
 readme = "README.md"
-requires-python = "~=3.12"
+requires-python = ">=3.12"
 authors = [{ name = "Mobolaji Williams" }]
 maintainers = [
     { name = "Mobolaji Williams" },
@@ -36,6 +36,7 @@ dependencies = [
     "streamlit>=1.44.1",
     "streamlit-shadcn-ui>=0.1.18",
     "structlog>=25.2.0",
+    "supabase>=2.21.1",
     "tomli>=2.2.1",
     "tomli-w>=1.2.0",
 ]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from .voter_matcher import create_ocr_matched_df

		__all__ = ["create_ocr_matched_df"]