Add PDF text extraction and anonymization utilities

Benji918 · Benji918 · commit 3a50bf3b61cc · 2025-09-07T22:39:32.000+01:00
Introduced functions to extract text from PDFs and anonymize sensitive data using regex and deterministic hashing. Integrated these utilities into a new API endpoint for analyzing and anonymizing uploaded PDF content. Removed unused PyMuPDF import to clean up dependencies.
diff --git a/app/api/v1/endpoints/statements.py b/app/api/v1/endpoints/statements.py
@@ -15,6 +15,10 @@
 from app.models.statement import StatementCategory
 from app.core.exceptions import ValidationError, FileProcessingError
 from app.core.logging import get_logger
+import os
+import tempfile
+from app.services.pdf_service import PDFExcelService
+
 
 router = APIRouter()
 logger = get_logger(__name__)
@@ -297,4 +301,87 @@ def bulk_delete_statements(
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
             detail="Bulk delete failed"
-        )
+        )
+
+
+# Regexes for the sensitive values masked by /test-pdf-hashing, keyed by the
+# label that prefixes each replacement token. Several entries overlap (an
+# 11-digit run fits "account_number", "bvn" and "nin" alike), so how clashes
+# are resolved is up to PDFExcelService.anonymize_text.
+_PII_PATTERNS = {
+    # TLD class is [A-Za-z]; a "|" inside a character class is a literal pipe.
+    "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
+    # No \b before "\+": a word boundary can never sit between whitespace (or
+    # the start of the text) and "+", so "+234..." would otherwise never match.
+    "phone": r'(?:\+234|\b0)([789]\d{9})\b',
+    "account_number": r'\b\d{10,20}\b',
+    "address": r'\d+\s+\w+(?:\s+\w+)*\s+(Street|St|Avenue|Ave|Close|Rd|Road|Lane|Ln|Crescent|Cres)\b',
+    "name": r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',
+    "bvn": r'\b\d{11}\b',
+    "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
+    "credit_card": r'\b(?:\d{4}[- ]?){3}\d{4}\b',
+    "routing_number": r'\b\d{9}\b',
+    "iban": r'\b[A-Za-z]{2}\d{2}[A-Za-z0-9]{11,30}\b',
+    # "swift_code": r'\b[A-Z]{4}[A-Z]{2}[A-Za-z0-9]{2}([A-Za-z0-9]{3})?\b',
+    # "currency": r'\b[\$€¥₹£]?\d{1,3}(,\d{3})*(\.\d{1,2})?\b',
+    "tin": r'\b\d{2}-\d{7}\b',
+    "pan": r'\b[A-Z]{5}\d{4}[A-Z]\b',
+    "gstin": r'\b\d{2}[A-Z]{5}\d{4}[A-Z][1-9A-Z][Z][A-Z0-9]\b',
+    # "cheque_number": r'\b\d{6,9}\b',
+    "reference_number": r'\b[A-Z]{3,4}-\d{6,10}\b',
+    # "business_reg": r'\b[A-Z0-9]{7,15}\b',
+    # "sort_code": r'\b\d{9}\b',
+    "ip_address": r'\b\d{1,3}(?:\.\d{1,3}){3}\b',
+    "ipv4": r'\b\d{1,3}(?:\.\d{1,3}){3}\b',
+    "url": r'\bhttps?://[^\s]+\b',
+    "merchant_id": r'\b(M|C)-[A-Za-z0-9]{6,12}\b',
+    "nin": r'\b\d{11}\b',
+}
+
+
+@router.post("/test-pdf-hashing")
+async def analyze_statement(file: UploadFile = File(...)):
+    """Return the text of an uploaded PDF with sensitive values masked.
+
+    The upload is written to a temporary ``.pdf`` file, its text is extracted
+    with ``PDFExcelService.extract_text_from_pdf`` and every ``_PII_PATTERNS``
+    match is replaced via ``PDFExcelService.anonymize_text``.
+
+    Args:
+        file: The uploaded PDF.
+
+    Returns:
+        ``{"anonymized_text": <masked text>}``.
+
+    Raises:
+        HTTPException: 400 when the upload is empty; 500 when extraction or
+            anonymization fails.
+    """
+    payload = await file.read()
+    if not payload:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Uploaded file is empty"
+        )
+
+    tmp_path = None
+    try:
+        # delete=False so the file survives the `with` block; the extractor
+        # opens it by path afterwards and the `finally` below removes it.
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+            tmp_path = tmp.name
+            tmp.write(payload)
+
+        text = PDFExcelService.extract_text_from_pdf(tmp_path)
+        anonymized_text = PDFExcelService.anonymize_text(text, _PII_PATTERNS)
+        # TODO: send anonymized_text to the Gemini LLM here.
+        return {"anonymized_text": anonymized_text}
+    except Exception as exc:
+        logger.exception("PDF anonymization failed")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="PDF anonymization failed"
+        ) from exc
+    finally:
+        if tmp_path is not None:
+            os.unlink(tmp_path)
diff --git a/app/services/ai_service.py b/app/services/ai_service.py
@@ -18,7 +18,6 @@
 import re
 import hashlib
 from typing import Dict, Any, List
-import fitz  # PyMuPDF for PDF processing
 from dataclasses import dataclass
 
 @dataclass
diff --git a/app/services/pdf_service.py b/app/services/pdf_service.py
@@ -10,6 +10,12 @@
 from app.core.config import settings
 from app.core.logging import LoggerMixin
 from app.core.exceptions import ExternalServiceError, FileProcessingError
+import pdfplumber
+import PyPDF2
+import re
+from cryptography.hazmat.primitives import hashes
+from cryptography.hazmat.backends import default_backend
+from cryptography.hazmat.primitives import hashes
 
 
 class PDFExcelService(LoggerMixin):
@@ -265,5 +271,92 @@ def extract_metadata(self, df: pd.DataFrame) -> Dict[str, Any]:
             return {}
 
 
+    @staticmethod
+    def extract_text_from_pdf(pdf_path):
+        """Return the text of every page of a PDF, each followed by a newline.
+
+        pdfplumber is tried first. If it raises, its partial output is dropped
+        and the whole file is re-read with PyPDF2, so a failure midway through
+        the document cannot leave pages duplicated in the result.
+
+        Args:
+            pdf_path: Path of the PDF file to read.
+
+        Returns:
+            The concatenated page texts. A page with no extractable text
+            contributes only its trailing newline.
+
+        Raises:
+            Exception: Whatever the PyPDF2 fallback raises when it fails too.
+        """
+        try:
+            with pdfplumber.open(pdf_path) as pdf:
+                # Guard in case extract_text() returns None for a page without
+                # text: treat it as empty instead of failing on None + str.
+                return "".join(
+                    f"{page.extract_text() or ''}\n" for page in pdf.pages
+                )
+        except Exception:
+            # Fallback to PyPDF2, starting again from the first page.
+            with open(pdf_path, "rb") as f:
+                reader = PyPDF2.PdfReader(f)
+                return "".join(
+                    f"{page.extract_text() or ''}\n" for page in reader.pages
+                )
+
+    @staticmethod
+    def deterministic_hash(value: str, salt: str = "fixed_salt_123") -> str:
+        """Return the first 16 hex digits of SHA-256 over ``value + salt``.
+
+        The same ``(value, salt)`` pair always yields the same token.
+
+        NOTE(review): the default salt is a constant in source, so tokens for
+        short numeric values could be recovered by hashing every candidate;
+        consider supplying a secret salt from configuration.
+        """
+        hasher = hashes.Hash(hashes.SHA256(), backend=default_backend())
+        hasher.update(f"{value}{salt}".encode())
+        return hasher.finalize().hex()[:16]
+
+    @staticmethod
+    def anonymize_text(text: str, patterns: Dict[str, str]) -> str:
+        """Replace each pattern match in *text* with a ``LABEL_<hash>`` token.
+
+        Matches from different patterns may overlap (or coincide exactly).
+        Replacing each of them by its original offsets would splice one token
+        into another, so overlaps are resolved first: the leftmost match wins,
+        then the longest, then the pattern listed first in *patterns*.
+
+        Args:
+            text: The text to anonymize.
+            patterns: Maps a label to the regex whose matches get that label.
+
+        Returns:
+            *text* with every retained match replaced by the upper-cased label,
+            an underscore and ``deterministic_hash`` of the matched string.
+        """
+        # Sorting (start, -end, rank, label) orders candidates leftmost first,
+        # then longest first, then by position in `patterns`.
+        candidates = sorted(
+            (match.start(), -match.end(), rank, label)
+            for rank, (label, pattern) in enumerate(patterns.items())
+            for match in re.finditer(pattern, text)
+            if match.end() > match.start()  # ignore empty matches
+        )
+
+        pieces = []
+        cursor = 0  # end of the last span already emitted
+        for start, neg_end, _rank, label in candidates:
+            if start < cursor:
+                continue  # overlaps a span that was already replaced
+            end = -neg_end
+            hashed_value = PDFExcelService.deterministic_hash(text[start:end])
+            pieces.append(text[cursor:start])
+            pieces.append(f"{label.upper()}_{hashed_value}")
+            cursor = end
+        pieces.append(text[cursor:])
+        return "".join(pieces)
+
+
 # Create service instance
 pdf_service = PDFExcelService()