Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions backend/app/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from fuzzy_match_helper import create_ocr_matched_df, create_select_voter_records
from ocr_helper import create_ocr_df
from routers import file
from settings.settings_repo import config
from utils import logger

from .matcher import create_ocr_matched_df
from .ocr import create_ocr_df

app = FastAPI(root_path="/api")
app.state.voter_records_df = None

Expand Down Expand Up @@ -48,12 +49,10 @@ def ocr(response: Response):

logger.info("Compiling Voter Record Data...")

select_voter_records = create_select_voter_records(app.state.voter_records_df)

logger.info("Matching petition signatures to voter records...")

ocr_matched_df = create_ocr_matched_df(
ocr_df, select_voter_records, threshold=config["BASE_THRESHOLD"]
ocr_df, app.state.voter_records_df, threshold=config["BASE_THRESHOLD"]
)
response.headers["Content-Type"] = "application/json"
return {"data": ocr_matched_df.to_dict(orient="records"), "stats": {}}
3 changes: 3 additions & 0 deletions backend/app/matcher/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .voter_matcher import create_ocr_matched_df

__all__ = ["create_ocr_matched_df"]
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
# needed libraries
### structured outputs; replacements
import os
import json
from typing import List, Tuple
from tqdm.notebook import tqdm
from rapidfuzz import fuzz
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import logging
import os
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime

# local environment storage
repo_name = "Ballot-Initiative"
REPODIR = os.getcwd()
load_dotenv(os.path.join(REPODIR, ".env"), override=True)
import numpy as np
import pandas as pd
from pandas import DataFrame
from rapidfuzz import fuzz
from tqdm import tqdm

# load config
with open("config.json", "r") as f:
Expand Down Expand Up @@ -49,7 +44,7 @@
###


def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
def _create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:
"""
Creates a simplified DataFrame with full names and addresses from voter records.

Expand Down Expand Up @@ -84,19 +79,19 @@ def create_select_voter_records(voter_records: pd.DataFrame) -> pd.DataFrame:


def score_fuzzy_match_slim(
ocr_result: str, comparison_list: List[str], scorer_=fuzz.ratio, limit_=10
) -> List[Tuple[str, int, int]]:
ocr_result: str, comparison_list: list[str], scorer_=fuzz.ratio, limit_=10
) -> list[tuple[str, int, int]]:
"""
Scores the fuzzy match between the OCR result and the comparison list.

Args:
ocr_result (str): The OCR result to match.
comparison_list (List[str]): The list of strings to compare against.
comparison_list (list[str]): The list of strings to compare against.
scorer_ (function): The scorer function to use.
limit_ (int): The number of top matches to return.

Returns:
List[Tuple[str, int, int]]: The list of top matches with their scores and indices.
list[tuple[str, int, int]]: The list of top matches with their scores and indices.
"""
logger.debug(f"Starting fuzzy matching for: {ocr_result[:30]}...")

Expand All @@ -118,9 +113,9 @@ def score_fuzzy_match_slim(
return results


def get_matched_name_address(
def _get_matched_name_address(
ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
) -> List[Tuple[str, str, float, int]]:
) -> list[tuple[str, str, float, int]]:
"""
Optimized name and address matching

Expand All @@ -130,7 +125,7 @@ def get_matched_name_address(
select_voter_records (pd.DataFrame): The DataFrame containing voter records.

Returns:
List[Tuple[str, str, float, int]]: The list of top matches with their scores and indices.
list[tuple[str, str, float, int]]: The list of top matches with their scores and indices.
"""
logger.debug(f"Matching - Name: {ocr_name[:30]}... Address: {ocr_address[:30]}...")

Expand Down Expand Up @@ -168,7 +163,7 @@ def get_matched_name_address(

def create_ocr_matched_df(
ocr_df: pd.DataFrame,
select_voter_records: pd.DataFrame,
voter_records: pd.DataFrame,
threshold: float = config["BASE_THRESHOLD"],
st_bar=None,
) -> pd.DataFrame:
Expand All @@ -188,6 +183,10 @@ def create_ocr_matched_df(
f"Starting matching process for {len(ocr_df)} records with threshold {threshold}"
)

select_voter_records: DataFrame = _create_select_voter_records(
voter_records=voter_records
)

# Process in batches for better memory management
batch_size = 1000
results = []
Expand All @@ -199,10 +198,10 @@ def create_ocr_matched_df(
)

# Process batch in parallel
with ThreadPoolExecutor() as executor:
with ProcessPoolExecutor() as executor:
batch_results = list(
executor.map(
lambda row: get_matched_name_address(
lambda row: _get_matched_name_address(
row["OCR Name"], row["OCR Address"], select_voter_records
),
[row for _, row in batch.iterrows()],
Expand Down
4 changes: 2 additions & 2 deletions backend/app/ocr/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .ocr_client_factory import extract_from_encoding_async
from .ocr_helper import create_ocr_df

__all__ = ["extract_from_encoding_async"]
__all__ = ["create_ocr_df"]
47 changes: 21 additions & 26 deletions backend/app/ocr_helper.py → backend/app/ocr/ocr_helper.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from typing import List
import asyncio
import base64
import os
import json
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import pandas as pd
import asyncio
import fitz # Add this import at the top with other imports

import logging
import os
from datetime import datetime

from ocr import extract_from_encoding_async
import fitz # Add this import at the top with other imports
import pandas as pd
from tqdm import tqdm

from .ocr_client_factory import extract_from_encoding_async

# Set up logging
log_directory = "logs"
Expand All @@ -36,9 +34,6 @@
logger.addHandler(file_handler)
logger.addHandler(console_handler)

repo_name = "Ballot-Initiative"
REPODIR = os.getcwd()
load_dotenv(os.path.join(REPODIR, ".env"), override=True)

# open ai api key
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
Expand All @@ -50,7 +45,7 @@
config = json.load(f)


def collecting_pdf_encoded_images(file_path: str) -> List[str]:
def _collecting_pdf_encoded_images(file_path: str) -> list[str]:
"""Convert PDF pages to encoded images, cropping to target area.
Returns list of base64 encoded image strings."""

Expand Down Expand Up @@ -97,17 +92,17 @@ def collecting_pdf_encoded_images(file_path: str) -> List[str]:


# function for adding data
def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
def _add_metadata(initial_data: list[dict], page_no: int, filename: str) -> list[dict]:
"""
Adds page number, row number, and filename metadata to the recognized signatures

Args:
initial_data (List[dict]): The initial data to add metadata to.
initial_data (list[dict]): The initial data to add metadata to.
page_no (int): The page number of the current page.
filename (str): The name of the file.

Returns:
List[dict]: The final data with metadata.
list[dict]: The final data with metadata.
"""

final_data = list()
Expand All @@ -121,7 +116,7 @@ def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[
return final_data


async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
async def _process_batch_async(encodings: list[str]) -> list[list[dict]]:
"""
Process a batch of images concurrently
"""
Expand All @@ -132,7 +127,7 @@ async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
return results


def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
def _get_or_create_event_loop() -> asyncio.AbstractEventLoop:
try:
return asyncio.get_event_loop()
except RuntimeError:
Expand All @@ -141,13 +136,13 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
return loop


def collect_ocr_data(
def _collect_ocr_data(
filedir: str,
filename: str,
max_page_num: int = None,
batch_size: int = 10,
st_bar=None,
) -> List[dict]:
) -> list[dict]:
"""
Collects OCR data from a PDF file.

Expand All @@ -165,7 +160,7 @@ def collect_ocr_data(
logger.info(f"Parameters - max_page_num: {max_page_num}, batch_size: {batch_size}")

# collecting images
encoded_images = collecting_pdf_encoded_images(os.path.join(filedir, filename))
encoded_images = _collecting_pdf_encoded_images(os.path.join(filedir, filename))

# selecting pages
if max_page_num:
Expand All @@ -180,7 +175,7 @@ def collect_ocr_data(
total_pages = len(encoded_images)

# getting event loop
loop = get_or_create_event_loop()
loop = _get_or_create_event_loop()

# Process in batches
logger.info(f"Processing {total_pages} pages in batches of {batch_size}")
Expand All @@ -199,12 +194,12 @@ def collect_ocr_data(
)

# Run async batch processing using the event loop
batch_results = loop.run_until_complete(process_batch_async(batch))
batch_results = loop.run_until_complete(_process_batch_async(batch))

# Add metadata for each result in the batch
for page_idx, result in enumerate(batch_results):
current_page = i + page_idx
ocr_data = add_metadata(result, current_page, filename)
ocr_data = _add_metadata(result, current_page, filename)
full_data.extend(ocr_data)

logger.info(
Expand All @@ -218,7 +213,7 @@ def collect_ocr_data(
def create_ocr_df(
filedir: str,
filename: str,
max_page_num: int = None,
max_page_num: int | None = None,
batch_size: int = 10,
st_bar=None,
) -> pd.DataFrame:
Expand All @@ -238,7 +233,7 @@ def create_ocr_df(
logger.info("Starting OCR DataFrame creation")

# gathering ocr_data
ocr_data = collect_ocr_data(
ocr_data = _collect_ocr_data(
filedir,
filename,
max_page_num=max_page_num,
Expand Down
7 changes: 4 additions & 3 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
[project]
name = "ballot-petition-signature-verifier"
name = "vote-catcher-server"
version = "0.0.15"
description = "A python package for verifying ballot petition signatures"
description = "A Python server for collecting and verifying votes and ballot petition signatures"
readme = "README.md"
requires-python = "~=3.12"
requires-python = ">=3.12"
authors = [{ name = "Mobolaji Williams" }]
maintainers = [
{ name = "Mobolaji Williams" },
Expand Down Expand Up @@ -36,6 +36,7 @@ dependencies = [
"streamlit>=1.44.1",
"streamlit-shadcn-ui>=0.1.18",
"structlog>=25.2.0",
"supabase>=2.21.1",
"tomli>=2.2.1",
"tomli-w>=1.2.0",
]
Expand Down
Loading