Skip to content

Commit 1da1061

Browse files
authored
perf(attack-paths): reduce sync and findings memory usage with smaller batches and cursor iteration (#10359)
1 parent e8aaf52 commit 1da1061

File tree

5 files changed

+353
-191
lines changed

5 files changed

+353
-191
lines changed

api/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ All notable changes to the **Prowler API** are documented in this file.
1212

1313
- Attack Paths: Complete migration to private graph labels and properties, removing deprecated dual-write support [(#10268)](https://github.com/prowler-cloud/prowler/pull/10268)
1414
- Attack Paths: Added tenant and provider related labels to the nodes so they can be easily filtered on custom queries [(#10308)](https://github.com/prowler-cloud/prowler/pull/10308)
15+
- Attack Paths: Reduce sync and findings memory usage with smaller batches, cursor iteration, and sequential sessions [(#10359)](https://github.com/prowler-cloud/prowler/pull/10359)
1516

1617
### 🐞 Fixed
1718

api/src/backend/tasks/jobs/attack_paths/config.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
from uuid import UUID
44

55
from config.env import env
6-
76
from tasks.jobs.attack_paths import aws
87

9-
10-
# Batch size for Neo4j operations
8+
# Batch size for Neo4j write operations (resource labeling, cleanup)
119
BATCH_SIZE = env.int("ATTACK_PATHS_BATCH_SIZE", 1000)
10+
# Batch size for Postgres findings fetch (keyset pagination page size)
11+
FINDINGS_BATCH_SIZE = env.int("ATTACK_PATHS_FINDINGS_BATCH_SIZE", 500)
12+
# Batch size for temp-to-tenant graph sync (nodes and relationships per cursor page)
13+
SYNC_BATCH_SIZE = env.int("ATTACK_PATHS_SYNC_BATCH_SIZE", 250)
1214

1315
# Neo4j internal labels (Prowler-specific, not provider-specific)
1416
# - `Internet`: Singleton node representing external internet access for exposed-resource queries

api/src/backend/tasks/jobs/attack_paths/findings.py

Lines changed: 57 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,15 @@
99
"""
1010

1111
from collections import defaultdict
12-
from dataclasses import asdict, dataclass, fields
1312
from typing import Any, Generator
1413
from uuid import UUID
1514

1615
import neo4j
17-
1816
from cartography.config import Config as CartographyConfig
1917
from celery.utils.log import get_task_logger
20-
21-
from api.db_router import READ_REPLICA_ALIAS
22-
from api.db_utils import rls_transaction
23-
from api.models import Finding as FindingModel
24-
from api.models import Provider, ResourceFindingMapping
25-
from prowler.config import config as ProwlerConfig
2618
from tasks.jobs.attack_paths.config import (
2719
BATCH_SIZE,
20+
FINDINGS_BATCH_SIZE,
2821
get_node_uid_field,
2922
get_provider_resource_label,
3023
get_root_node_label,
@@ -37,75 +30,54 @@
3730
render_cypher_template,
3831
)
3932

40-
logger = get_task_logger(__name__)
41-
42-
43-
# Type Definitions
44-
# -----------------
45-
46-
# Maps dataclass field names to Django ORM query field names
47-
_DB_FIELD_MAP: dict[str, str] = {
48-
"check_title": "check_metadata__checktitle",
49-
}
50-
51-
52-
@dataclass(slots=True)
53-
class Finding:
54-
"""
55-
Finding data for Neo4j ingestion.
56-
57-
Can be created from a Django .values() query result using from_db_record().
58-
"""
33+
from api.db_router import READ_REPLICA_ALIAS
34+
from api.db_utils import rls_transaction
35+
from api.models import Finding as FindingModel
36+
from api.models import Provider, ResourceFindingMapping
37+
from prowler.config import config as ProwlerConfig
5938

60-
id: str
61-
uid: str
62-
inserted_at: str
63-
updated_at: str
64-
first_seen_at: str
65-
scan_id: str
66-
delta: str
67-
status: str
68-
status_extended: str
69-
severity: str
70-
check_id: str
71-
check_title: str
72-
muted: bool
73-
muted_reason: str | None
74-
resource_uid: str | None = None
75-
76-
@classmethod
77-
def get_db_query_fields(cls) -> tuple[str, ...]:
78-
"""Get field names for Django .values() query."""
79-
return tuple(
80-
_DB_FIELD_MAP.get(f.name, f.name)
81-
for f in fields(cls)
82-
if f.name != "resource_uid"
83-
)
39+
logger = get_task_logger(__name__)
8440

85-
@classmethod
86-
def from_db_record(cls, record: dict[str, Any], resource_uid: str) -> "Finding":
87-
"""Create a Finding from a Django .values() query result."""
88-
return cls(
89-
id=str(record["id"]),
90-
uid=record["uid"],
91-
inserted_at=record["inserted_at"],
92-
updated_at=record["updated_at"],
93-
first_seen_at=record["first_seen_at"],
94-
scan_id=str(record["scan_id"]),
95-
delta=record["delta"],
96-
status=record["status"],
97-
status_extended=record["status_extended"],
98-
severity=record["severity"],
99-
check_id=str(record["check_id"]),
100-
check_title=record["check_metadata__checktitle"],
101-
muted=record["muted"],
102-
muted_reason=record["muted_reason"],
103-
resource_uid=resource_uid,
104-
)
10541

106-
def to_dict(self) -> dict[str, Any]:
107-
"""Convert to dict for Neo4j ingestion."""
108-
return asdict(self)
42+
# Django ORM field names for `.values()` queries
43+
# Most map 1:1 to Neo4j property names, exceptions are remapped in `_to_neo4j_dict`
44+
_DB_QUERY_FIELDS = [
45+
"id",
46+
"uid",
47+
"inserted_at",
48+
"updated_at",
49+
"first_seen_at",
50+
"scan_id",
51+
"delta",
52+
"status",
53+
"status_extended",
54+
"severity",
55+
"check_id",
56+
"check_metadata__checktitle",
57+
"muted",
58+
"muted_reason",
59+
]
60+
61+
62+
def _to_neo4j_dict(record: dict[str, Any], resource_uid: str) -> dict[str, Any]:
63+
"""Transform a Django `.values()` record into a `dict` ready for Neo4j ingestion."""
64+
return {
65+
"id": str(record["id"]),
66+
"uid": record["uid"],
67+
"inserted_at": record["inserted_at"],
68+
"updated_at": record["updated_at"],
69+
"first_seen_at": record["first_seen_at"],
70+
"scan_id": str(record["scan_id"]),
71+
"delta": record["delta"],
72+
"status": record["status"],
73+
"status_extended": record["status_extended"],
74+
"severity": record["severity"],
75+
"check_id": str(record["check_id"]),
76+
"check_title": record["check_metadata__checktitle"],
77+
"muted": record["muted"],
78+
"muted_reason": record["muted_reason"],
79+
"resource_uid": resource_uid,
80+
}
10981

11082

11183
# Public API
@@ -180,7 +152,7 @@ def add_resource_label(
180152

181153
def load_findings(
182154
neo4j_session: neo4j.Session,
183-
findings_batches: Generator[list[Finding], None, None],
155+
findings_batches: Generator[list[dict[str, Any]], None, None],
184156
prowler_api_provider: Provider,
185157
config: CartographyConfig,
186158
) -> None:
@@ -209,7 +181,7 @@ def load_findings(
209181
batch_size = len(batch)
210182
total_records += batch_size
211183

212-
parameters["findings_data"] = [f.to_dict() for f in batch]
184+
parameters["findings_data"] = batch
213185

214186
logger.info(f"Loading findings batch {batch_num} ({batch_size} records)")
215187
neo4j_session.run(query, parameters)
@@ -247,16 +219,17 @@ def cleanup_findings(
247219
def stream_findings_with_resources(
248220
prowler_api_provider: Provider,
249221
scan_id: str,
250-
) -> Generator[list[Finding], None, None]:
222+
) -> Generator[list[dict[str, Any]], None, None]:
251223
"""
252224
Stream findings with their associated resources in batches.
253225
254226
Uses keyset pagination for efficient traversal of large datasets.
255-
Memory efficient: yields one batch at a time, never holds all findings in memory.
227+
Memory efficient: yields one batch at a time as dicts ready for Neo4j ingestion,
228+
never holds all findings in memory.
256229
"""
257230
logger.info(
258231
f"Starting findings stream for scan {scan_id} "
259-
f"(tenant {prowler_api_provider.tenant_id}) with batch size {BATCH_SIZE}"
232+
f"(tenant {prowler_api_provider.tenant_id}) with batch size {FINDINGS_BATCH_SIZE}"
260233
)
261234

262235
tenant_id = prowler_api_provider.tenant_id
@@ -305,15 +278,14 @@ def _fetch_findings_batch(
305278
Uses read replica and RLS-scoped transaction.
306279
"""
307280
with rls_transaction(tenant_id, using=READ_REPLICA_ALIAS):
308-
# Use all_objects to avoid the ActiveProviderManager's implicit JOIN
309-
# through Scan -> Provider (to check is_deleted=False).
310-
# The provider is already validated as active in this context.
281+
        # Use `all_objects` to fetch `Finding` rows even when the `Provider` is soft-deleted
282+
        # (though in this context the provider has already been validated as active)
311283
qs = FindingModel.all_objects.filter(scan_id=scan_id).order_by("id")
312284

313285
if after_id is not None:
314286
qs = qs.filter(id__gt=after_id)
315287

316-
return list(qs.values(*Finding.get_db_query_fields())[:BATCH_SIZE])
288+
return list(qs.values(*_DB_QUERY_FIELDS)[:FINDINGS_BATCH_SIZE])
317289

318290

319291
# Batch Enrichment
@@ -323,7 +295,7 @@ def _fetch_findings_batch(
323295
def _enrich_batch_with_resources(
324296
findings_batch: list[dict[str, Any]],
325297
tenant_id: str,
326-
) -> list[Finding]:
298+
) -> list[dict[str, Any]]:
327299
"""
328300
Enrich findings with their resource UIDs.
329301
@@ -334,7 +306,7 @@ def _enrich_batch_with_resources(
334306
resource_map = _build_finding_resource_map(finding_ids, tenant_id)
335307

336308
return [
337-
Finding.from_db_record(finding, resource_uid)
309+
_to_neo4j_dict(finding, resource_uid)
338310
for finding in findings_batch
339311
for resource_uid in resource_map.get(finding["id"], [])
340312
]

0 commit comments

Comments
 (0)