Skip to content

Commit 1c21817

Browse files
committed
add report v2
1 parent 9973afd commit 1c21817

File tree

3 files changed

+240
-85
lines changed

3 files changed

+240
-85
lines changed

app/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ class Config:
561561
NOTIFICATION_DEEP_HISTORY_DELETE_ARCHIVED = os.environ.get("NOTIFICATION_DEEP_HISTORY_DELETE_ARCHIVED", "1") == "1"
562562

563563
REPORT_REQUEST_NOTIFICATIONS_TIMEOUT_MINUTES = 30
564-
REPORT_REQUEST_NOTIFICATIONS_CSV_BATCH_SIZE = 2500
564+
REPORT_REQUEST_NOTIFICATIONS_CSV_BATCH_SIZE = 50000
565565

566566

567567
######################

app/db_copy_utils.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
Database COPY utilities for streaming large result sets directly to CSV.
3+
4+
This module provides functions to execute PostgreSQL COPY commands for efficient
5+
CSV generation. COPY runs entirely in the database engine, avoiding the overhead
6+
of Python object serialization.
7+
"""
8+
9+
from io import BytesIO
10+
from typing import Any
11+
from uuid import UUID
12+
13+
from flask import current_app
14+
from sqlalchemy import case, desc, func, text
15+
from sqlalchemy.orm import aliased
16+
17+
from app import db
18+
from app.constants import NOTIFICATION_REPORT_REQUEST_MAPPING
19+
from app.models import ApiKey, Job, Notification, TemplateHistory, User
20+
21+
22+
def build_notifications_copy_query(
    service_id: UUID,
    notification_type: str,
    notification_statuses: list[str],
    limit_days: int,
    chunk_size: int,
    older_than_id: UUID | None = None,
) -> str:
    """Build a literal-SQL SELECT for one chunk of a service's notifications.

    The returned string is later wrapped in a ``COPY ... TO STDOUT`` command,
    so it is compiled with literal binds (COPY cannot take bound parameters).

    Args:
        service_id: service whose notifications are exported.
        notification_type: notification channel to export (e.g. email/sms/letter).
        notification_statuses: DB status values to include; empty means all.
        limit_days: only rows created within the last N days are selected.
        chunk_size: maximum rows per chunk (keyset-pagination page size).
        older_than_id: when set, select only rows strictly older than this
            notification by the (created_at, id) sort key.

    Returns:
        The compiled SQL string. The final two columns, ``id`` and
        ``created_at``, exist only for pagination and are stripped before the
        CSV is produced.
    """
    # Local import: tuple_ is not in this module's top-level sqlalchemy imports.
    from sqlalchemy import tuple_

    # Aliases for table names
    n = aliased(Notification)
    t = aliased(TemplateHistory)
    j = aliased(Job)
    u = aliased(User)
    a = aliased(ApiKey)

    # Recipient: letters store an address block in `to`; export only the first
    # line, trimmed of surrounding spaces and a trailing comma.
    recipient_col = case(
        (n.notification_type == "letter", func.regexp_replace(func.split_part(n.to, "\n", 1), "^ +| +$| ,$", "", "g")),
        else_=n.to,
    ).label("Recipient")

    # Human-readable permanent-failure wording depends on the channel.
    permanent_failure_msg = case(
        (n.notification_type == "email", "Email address doesn't exist"),
        (n.notification_type == "sms", "Phone number doesn't exist"),
        else_="Permanent failure",
    )

    # Map internal status codes to the user-facing wording used in reports;
    # unknown statuses fall through unchanged.
    status_col = case(
        (n.status == "created", "Sending"),
        (n.status == "sending", "Sending"),
        (n.status == "pending", "Sending"),
        (n.status == "sent", "Sent"),
        (n.status == "delivered", "Delivered"),
        (n.status == "pending-virus-check", "Pending virus check"),
        (n.status == "virus-scan-failed", "Virus detected"),
        (n.status == "returned-letter", "Returned letter"),
        (n.status == "failed", "Failed"),
        (n.status == "technical-failure", "Tech issue"),
        (n.status == "temporary-failure", "Inbox not accepting messages right now"),
        (n.status == "permanent-failure", permanent_failure_msg),
        (n.status == "cancelled", "Cancelled"),
        (n.status == "validation-failed", "Validation failed"),
        else_=n.status,
    ).label("Status")

    # created_at is stored in UTC; render it in Europe/London local time.
    time_col = func.to_char(
        func.timezone("Europe/London", func.timezone("UTC", n.created_at)), "YYYY-MM-DD HH24:MI:SS"
    ).label("Time")

    query = (
        db.session.query(
            recipient_col,
            func.coalesce(n.client_reference, "").label("Reference"),
            t.name.label("Template"),
            n.notification_type.label("Type"),
            func.coalesce(u.name, "").label("Sent by"),
            func.coalesce(u.email_address, "").label("Sent by email"),
            func.coalesce(j.original_file_name, "").label("Job"),
            status_col,
            time_col,
            func.coalesce(a.name, "").label("API key name"),
            n.id,  # For pagination (not exported to CSV)
            n.created_at,  # For pagination (not exported to CSV)
        )
        .select_from(n)
        .join(t, t.id == n.template_id)
        .outerjoin(j, j.id == n.job_id)
        .outerjoin(u, u.id == n.created_by_id)
        .outerjoin(a, a.id == n.api_key_id)
    )

    # int() guards the f-string interpolation into raw SQL below: nothing but
    # an integer may ever reach the interval literal.
    days = int(limit_days)
    query = query.filter(
        n.service_id == service_id,
        n.notification_type == notification_type,
        n.created_at >= func.now() - text(f"interval '{days} days'"),
        n.key_type != "test",  # test-key notifications are never exported
    )

    if notification_statuses:
        query = query.filter(n.status.in_(notification_statuses))

    if older_than_id:
        older_than_created_at = (
            db.session.query(Notification.created_at)
            .filter(Notification.id == older_than_id, Notification.service_id == service_id)
            .scalar_subquery()
        )
        # Keyset pagination must compare the full sort key (created_at, id):
        # comparing created_at alone silently skips any rows that share the
        # boundary row's timestamp, dropping them from the report.
        query = query.filter(tuple_(n.created_at, n.id) < tuple_(older_than_created_at, older_than_id))

    # Ordering must match the keyset comparison above.
    query = query.order_by(desc(n.created_at), desc(n.id)).limit(int(chunk_size))

    # COPY cannot use bound parameters, so inline every value as a literal.
    compiled = query.statement.compile(dialect=db.engine.dialect, compile_kwargs={"literal_binds": True})
    return str(compiled)
120+
121+
def execute_copy_to_bytes(
    query: str,
    include_header: bool = True,
) -> tuple[bytes, UUID | None, int]:
    """Execute *query* and stream the matching rows out of PostgreSQL as CSV.

    Runs the query once to learn the row count and the id of the last row
    (needed by the caller for keyset pagination), then wraps the same query in
    a ``COPY ... TO STDOUT WITH CSV`` command so the database serialises the
    CSV directly, skipping Python-level row formatting.

    Args:
        query: a self-contained SELECT whose final two columns are
            (id, created_at), used only for pagination.
        include_header: emit the CSV header row (first chunk only).

    Returns:
        (csv_bytes, last_notification_id, row_count); last id is None when the
        query matched no rows.
    """
    sql = query.strip()

    # Raw DBAPI connection: COPY is driver-level (psycopg2 copy_expert), not
    # something the ORM layer exposes.
    conn = db.engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            # NOTE(review): the query runs twice (here and inside COPY below).
            # The two statements do not necessarily share a snapshot, so
            # concurrent writes could make last_id/row_count diverge from the
            # COPY output — confirm this is acceptable for report generation.
            cursor.execute(sql)
            rows = cursor.fetchall()
            row_count = len(rows)

            last_id = None
            if row_count > 0:
                # The query's final two columns are (id, created_at), appended
                # only for pagination; id is the second-to-last column.
                last_id = rows[-1][-2]

            # Re-select the same rows minus the pagination-only columns and let
            # the database write them straight out as CSV.
            copy_query = f"""
                COPY (
                    SELECT
                        "Recipient",
                        "Reference",
                        "Template",
                        "Type",
                        "Sent by",
                        "Sent by email",
                        "Job",
                        "Status",
                        "Time",
                        "API key name"
                    FROM ({sql}) AS subquery
                ) TO STDOUT WITH CSV {"HEADER" if include_header else ""}
            """

            # Capture the COPY output in memory.
            buffer = BytesIO()
            cursor.copy_expert(copy_query, buffer)
            csv_bytes = buffer.getvalue()

            # Lazy %-style args: no string formatting unless INFO is enabled.
            current_app.logger.info(
                "COPY command executed successfully. Rows: %s, Size: %s bytes, Last ID: %s",
                row_count,
                len(csv_bytes),
                last_id,
            )

            return csv_bytes, last_id, row_count
        finally:
            # Fix: the cursor was previously never closed.
            cursor.close()
    finally:
        # Always return the raw connection to the pool.
        conn.close()
181+
182+
183+
def get_notifications_csv_chunk(
    service_id: UUID,
    notification_type: str,
    notification_status_filter: str,
    limit_days: int,
    chunk_size: int,
    older_than_id: UUID | None = None,
    include_header: bool = True,
) -> tuple[bytes, UUID | None, int]:
    """Produce one CSV chunk of a service's notifications via database COPY.

    Translates the report-level status filter into concrete DB statuses,
    builds the chunk query, and executes it as a COPY command.

    Returns:
        (csv_bytes, last_notification_id, row_count) as produced by
        execute_copy_to_bytes.
    """
    # An unknown filter maps to an empty list, i.e. no status restriction.
    statuses = NOTIFICATION_REPORT_REQUEST_MAPPING.get(notification_status_filter, [])

    sql = build_notifications_copy_query(
        service_id=service_id,
        notification_type=notification_type,
        notification_statuses=statuses,
        limit_days=limit_days,
        chunk_size=chunk_size,
        older_than_id=older_than_id,
    )
    return execute_copy_to_bytes(sql, include_header=include_header)

app/report_requests/process_notifications_report.py

Lines changed: 35 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import csv
2-
from io import StringIO
31
from typing import Any
42
from uuid import UUID
53

@@ -12,10 +10,9 @@
1210
s3_multipart_upload_part,
1311
)
1412

15-
from app.constants import NOTIFICATION_REPORT_REQUEST_MAPPING
16-
from app.dao.notifications_dao import get_notifications_for_service
1713
from app.dao.report_requests_dao import dao_get_report_request_by_id
1814
from app.dao.service_data_retention_dao import fetch_service_data_retention_by_notification_type
15+
from app.db_copy_utils import get_notifications_csv_chunk
1916

2017

2118
class ReportRequestProcessor:
@@ -25,17 +22,16 @@ def __init__(self, service_id: UUID, report_request_id: UUID):
2522
self.report_request = dao_get_report_request_by_id(service_id, report_request_id)
2623
self.notification_type = self.report_request.parameter["notification_type"]
2724
self.notification_status = self.report_request.parameter["notification_status"]
28-
self.page_size = current_app.config.get("REPORT_REQUEST_NOTIFICATIONS_CSV_BATCH_SIZE")
25+
self.chunk_size = current_app.config.get("REPORT_REQUEST_NOTIFICATIONS_CSV_BATCH_SIZE")
2926
self.s3_bucket = current_app.config["S3_BUCKET_REPORT_REQUESTS_DOWNLOAD"]
3027
self.filename = f"notifications_report/{report_request_id}.csv"
3128
self.upload_id: str | None = None
3229
self.parts: list[dict[str, Any]] = []
3330
self.part_number = 1
34-
self.csv_buffer = StringIO()
35-
self.csv_writer = csv.writer(self.csv_buffer)
31+
self.current_buffer = b""
32+
self.is_first_chunk = True
3633

3734
def process(self) -> None:
38-
self._initialize_csv()
3935
self._start_multipart_upload()
4036

4137
try:
@@ -46,20 +42,7 @@ def process(self) -> None:
4642
self._abort_upload()
4743
raise e
4844

49-
def _initialize_csv(self) -> None:
50-
headers = [
51-
"Recipient",
52-
"Reference",
53-
"Template",
54-
"Type",
55-
"Sent by",
56-
"Sent by email",
57-
"Job",
58-
"Status",
59-
"Time",
60-
"API key name",
61-
]
62-
self.csv_writer.writerow(headers)
45+
6346

6447
def _start_multipart_upload(self) -> None:
6548
response = s3_multipart_upload_create(self.s3_bucket, self.filename)
@@ -68,78 +51,46 @@ def _start_multipart_upload(self) -> None:
6851
def _fetch_and_upload_notifications(self) -> None:
    """Stream notification CSV chunks out of the database and into S3.

    Pages through the service's notifications via keyset pagination, appending
    each COPY-produced CSV chunk to the in-memory buffer and flushing it to S3
    whenever it reaches the multipart minimum part size.
    """
    retention = fetch_service_data_retention_by_notification_type(self.service_id, self.notification_type)
    # Fall back to the default 7-day window when no retention row exists.
    limit_days = retention.days_of_retention if retention else 7

    older_than_id = None
    while True:
        # One COPY-generated CSV chunk; header only on the first chunk.
        csv_bytes, last_id, row_count = get_notifications_csv_chunk(
            service_id=self.service_id,
            notification_type=self.notification_type,
            notification_status_filter=self.notification_status,
            limit_days=limit_days,
            chunk_size=self.chunk_size,
            older_than_id=older_than_id,
            include_header=self.is_first_chunk,
        )

        self.current_buffer += csv_bytes
        self._upload_part_if_needed()

        # Advance the keyset cursor for the next page.
        older_than_id = last_id
        self.is_first_chunk = False

        # A short page means we have drained the result set.
        if row_count < self.chunk_size:
            break

    # Upload any remaining data
    self._upload_remaining_data()
8482

85-
def _fetch_serialized_notifications(self, limit_days: int, older_than: str | None) -> list[dict[str, Any]]:
86-
statuses = NOTIFICATION_REPORT_REQUEST_MAPPING[self.notification_status]
87-
88-
notifications = get_notifications_for_service(
89-
service_id=self.service_id,
90-
filter_dict={
91-
"template_type": self.notification_type,
92-
"status": statuses,
93-
},
94-
page_size=self.page_size,
95-
count_pages=False,
96-
limit_days=limit_days,
97-
include_jobs=True,
98-
with_personalisation=False,
99-
include_from_test_key=False,
100-
error_out=False,
101-
include_one_off=True,
102-
older_than=older_than,
103-
)
104-
105-
serialized_notifications = [notification.serialize_for_csv() for notification in notifications]
106-
return serialized_notifications
107-
108-
def _convert_notifications_to_csv(self, serialized_notifications: list[dict[str, Any]]) -> list[tuple]:
109-
values = []
110-
for notification in serialized_notifications:
111-
values.append(
112-
(
113-
# the recipient for precompiled letters is the full address block
114-
notification["recipient"].splitlines()[0].lstrip().rstrip(" ,"),
115-
notification["client_reference"],
116-
notification["template_name"],
117-
notification["template_type"],
118-
notification["created_by_name"] or "",
119-
notification["created_by_email_address"] or "",
120-
notification["job_name"] or "",
121-
notification["status"],
122-
notification["created_at"],
123-
notification["api_key_name"] or "",
124-
)
125-
)
126-
return values
12783

128-
def _upload_csv_part_if_needed(self) -> None:
129-
data_bytes = self.csv_buffer.getvalue().encode("utf-8")
130-
if len(data_bytes) >= S3_MULTIPART_UPLOAD_MIN_PART_SIZE:
131-
self._upload_part(data_bytes)
13284

85+
def _upload_part_if_needed(self) -> None:
    """Flush the buffer to S3 once it reaches the multipart minimum part size."""
    if len(self.current_buffer) < S3_MULTIPART_UPLOAD_MIN_PART_SIZE:
        return
    self._upload_part(self.current_buffer)
    # Start a fresh buffer for the next part.
    self.current_buffer = b""
13890

13991
def _upload_remaining_data(self) -> None:
    """Upload whatever is left in the buffer as the final (possibly small) part."""
    # Empty bytes are falsy, so this only uploads when data remains.
    if self.current_buffer:
        self._upload_part(self.current_buffer)
14394

14495
def _upload_part(self, data_bytes: bytes) -> None:
14596
response = s3_multipart_upload_part(

0 commit comments

Comments
 (0)