Skip to content

Commit 3a50bf3

Browse files
committed
Add PDF text extraction and anonymization utilities
Introduced functions to extract text from PDFs and anonymize sensitive data using regex and deterministic hashing. Integrated these utilities into a new API endpoint for analyzing and anonymizing uploaded PDF content. Removed unused PyMuPDF import to clean up dependencies.
1 parent bc749e8 commit 3a50bf3

File tree

3 files changed

+111
-2
lines changed

3 files changed

+111
-2
lines changed

app/api/v1/endpoints/statements.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
from app.models.statement import StatementCategory
1616
from app.core.exceptions import ValidationError, FileProcessingError
1717
from app.core.logging import get_logger
18+
import os
19+
import tempfile
20+
from app.services.pdf_service import PDFExcelService
21+
1822

1923
router = APIRouter()
2024
logger = get_logger(__name__)
@@ -297,4 +301,45 @@ def bulk_delete_statements(
297301
raise HTTPException(
298302
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
299303
detail="Bulk delete failed"
300-
)
304+
)
305+
306+
@router.post("/test-pdf-hashing")
async def analyze_statement(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF and return it with sensitive values masked.

    The upload is written to a temporary ``.pdf`` file, its text is extracted
    with ``PDFExcelService.extract_text_from_pdf`` and every match of the
    patterns below is replaced through ``PDFExcelService.anonymize_text``.

    Args:
        file: The uploaded PDF document.

    Returns:
        A dict with a single key, ``"anonymized_text"``, holding the masked text.
    """
    # NOTE(review): several patterns can match the same value ("bvn" and "nin"
    # are identical, so are "ip_address" and "ipv4", and "account_number" also
    # covers any 11-digit number), so one value may be claimed by several keys.
    patterns = {
        # The TLD class used to be [A-Z|a-z], which also accepted a literal "|".
        "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        # A leading \b can never match in front of "+" unless a word character
        # precedes it, so "+234..." numbers were missed; the lookbehind keeps
        # the same boundary rule for the "0..." form and fixes the "+234" form.
        "phone": r'(?<!\w)(?:\+234|0)([789]\d{9})\b',
        "account_number": r'\b\d{10,20}\b',
        "address": r'\d+\s+\w+(?:\s+\w+)*\s+(Street|St|Avenue|Ave|Close|Rd|Road|Lane|Ln|Crescent|Cres)\b',
        "name": r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',
        "bvn": r'\b\d{11}\b',
        "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
        "credit_card": r'\b(?:\d{4}[- ]?){3}\d{4}\b',
        "routing_number": r'\b\d{9}\b',
        "iban": r'\b[A-Za-z]{2}\d{2}[A-Za-z0-9]{11,30}\b',
        # "swift_code": r'\b[A-Z]{4}[A-Z]{2}[A-Za-z0-9]{2}([A-Za-z0-9]{3})?\b',
        # "currency": r'\b[\$€¥₹£]?\d{1,3}(,\d{3})*(\.\d{1,2})?\b',
        "tin": r'\b\d{2}-\d{7}\b',
        "pan": r'\b[A-Z]{5}\d{4}[A-Z]\b',
        "gstin": r'\b\d{2}[A-Z]{5}\d{4}[A-Z][1-9A-Z][Z][A-Z0-9]\b',
        # "cheque_number": r'\b\d{6,9}\b',
        "reference_number": r'\b[A-Z]{3,4}-\d{6,10}\b',
        # "business_reg": r'\b[A-Z0-9]{7,15}\b',
        # "sort_code": r'\b\d{9}\b',
        "ip_address": r'\b\d{1,3}(?:\.\d{1,3}){3}\b',
        "ipv4": r'\b\d{1,3}(?:\.\d{1,3}){3}\b',
        "url": r'\bhttps?://[^\s]+\b',
        "merchant_id": r'\b(M|C)-[A-Za-z0-9]{6,12}\b',
        "nin": r'\b\d{11}\b',
    }

    tmp_path = None
    try:
        # delete=False because the extractor reopens the file by path; the
        # write happens inside the try so a failed read/write cannot leave
        # the temporary file behind.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            tmp.write(await file.read())

        text = PDFExcelService.extract_text_from_pdf(tmp_path)
        anonymized_text = PDFExcelService.anonymize_text(text, patterns)
        # TODO: send anonymized_text to the Gemini LLM here.
        return {"anonymized_text": anonymized_text}
    finally:
        if tmp_path is not None:
            os.unlink(tmp_path)  # Clean up the temporary copy of the upload

app/services/ai_service.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
import re
1919
import hashlib
2020
from typing import Dict, Any, List
21-
import fitz # PyMuPDF for PDF processing
2221
from dataclasses import dataclass
2322

2423
@dataclass

app/services/pdf_service.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
from app.core.config import settings
1111
from app.core.logging import LoggerMixin
1212
from app.core.exceptions import ExternalServiceError, FileProcessingError
13+
import pdfplumber
14+
import PyPDF2
15+
import re
16+
from cryptography.hazmat.primitives import hashes
17+
from cryptography.hazmat.backends import default_backend
18+
from cryptography.hazmat.primitives import hashes
1319

1420

1521
class PDFExcelService(LoggerMixin):
@@ -265,5 +271,64 @@ def extract_metadata(self, df: pd.DataFrame) -> Dict[str, Any]:
265271
return {}
266272

267273

274+
@staticmethod
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one trailing newline per page.

    pdfplumber is tried first; if it raises, the whole document is re-read
    with PyPDF2 instead.

    Declared as a static method because it takes no ``self``; this keeps the
    existing ``PDFExcelService.extract_text_from_pdf(path)`` call working and
    also makes the call valid on an instance.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``.

    Raises:
        Exception: Whatever the PyPDF2 fallback raises when it cannot read
            the file either.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # A page without extractable text may yield None; treat it as
            # empty instead of failing on ``None + "\n"``.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception:
        # Fallback to PyPDF2. The page list is rebuilt from scratch so pages
        # already read by pdfplumber before the failure are not duplicated.
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(f"{page_text}\n" for page_text in page_texts)
287+
288+
@staticmethod
def deterministic_hash(value: str, salt: str = "fixed_salt_123") -> str:
    """Return a short, repeatable SHA-256 based token for ``value``.

    The digest is computed over ``value + salt`` (UTF-8 encoded) and cut to
    its first 16 hexadecimal characters, so equal inputs always map to the
    same token.

    Args:
        value: The text to hash.
        salt: Text appended to ``value`` before hashing.

    Returns:
        The first 16 hex characters (64 bits) of the SHA-256 digest.
    """
    # hashlib yields the same SHA-256 digest as the cryptography ``Hash``
    # object without needing the legacy ``default_backend()`` argument.
    import hashlib

    # NOTE(review): the default salt is a constant in source, so tokens for
    # low-entropy values (e.g. phone numbers) could presumably be recovered
    # by brute force -- confirm whether a secret salt should be supplied.
    return hashlib.sha256((value + salt).encode()).hexdigest()[:16]
293+
294+
@staticmethod
def anonymize_text(text: str, patterns: Dict[str, str]) -> str:
    """Replace every pattern match in ``text`` with a ``KEY_<hash>`` token.

    Each accepted match is replaced by the upper-cased pattern key, an
    underscore and ``PDFExcelService.deterministic_hash`` of the matched text.

    When matches from different patterns overlap, only one of them is
    replaced: the match that starts first wins, then the longest one, then
    the pattern that appears first in ``patterns``. Replacing overlapping
    spans one after another would apply stale offsets to already rewritten
    text and corrupt the output. Zero-length matches are ignored.

    Args:
        text: The text to anonymize.
        patterns: Mapping of label to regular expression.

    Returns:
        ``text`` with every accepted match replaced by its token.
    """
    # Collect every non-empty match as
    # (start, end, pattern order, key, matched text).
    candidates = []
    for order, (key, pattern) in enumerate(patterns.items()):
        for match in re.finditer(pattern, text):
            start, end = match.span()
            if start == end:
                continue
            candidates.append((start, end, order, key, match.group()))

    # Leftmost first, then longest, then declaration order of the pattern.
    candidates.sort(key=lambda c: (c[0], -(c[1] - c[0]), c[2]))

    # Rebuild the text left to right from untouched slices and tokens.
    pieces = []
    cursor = 0
    for start, end, _order, key, original in candidates:
        if start < cursor:
            # Overlaps a span that has already been replaced.
            continue
        hashed_value = PDFExcelService.deterministic_hash(original)
        pieces.append(text[cursor:start])
        pieces.append(f"{key.upper()}_{hashed_value}")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)
329+
330+
331+
332+
268333
# Create service instance
269334
pdf_service = PDFExcelService()

0 commit comments

Comments
 (0)