|
| 1 | +import logging |
| 2 | +import re |
| 3 | +import xml.etree.ElementTree as ET |
| 4 | +from dataclasses import dataclass |
| 5 | +from dataclasses import field |
| 6 | +from datetime import datetime |
| 7 | +from datetime import timezone |
| 8 | +from typing import List |
| 9 | +from typing import Tuple |
| 10 | + |
| 11 | +import requests |
| 12 | + |
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)

# Define the full S3 namespace URL string outside the class for readability
# (ElementTree searches need it in "{namespace}Tag" form).
S3_NS_URL: str = "http://s3.amazonaws.com/doc/2006-03-01/"
| 17 | + |
| 18 | + |
@dataclass
class SnapshotFile:
    """Parsed metadata for a single snapshot object in the S3 bucket."""

    key: str  # full S3 object key, e.g. "cardano-db-sync/13.2/....tgz"
    name: str  # basename of the key
    last_modified: datetime  # timezone-aware modification timestamp
    size: int  # object size in bytes
    size_gb: float = field(init=False)  # derived from `size` after init

    def __post_init__(self) -> None:
        # Derived field: express the raw byte count in gibibytes (2**30 bytes).
        bytes_per_gib = 1 << 30
        self.size_gb = self.size / bytes_per_gib
| 31 | + |
| 32 | + |
class DBSyncSnapshotService:
    """Service class to interact with the Cardano DB-Sync S3 repository.

    Listing is done through the anonymous S3 REST API (ListObjectsV2), so no
    AWS credentials or boto3 dependency are required.
    """

    BUCKET_URL: str = "https://update-cardano-mainnet.iohk.io"
    ROOT_PREFIX: str = "cardano-db-sync/"
    # Fail fast instead of hanging forever on an unresponsive endpoint.
    REQUEST_TIMEOUT: float = 30.0

    def _get_s3_objects(self, prefix: str = "", delimiter: str = "") -> bytes:
        """Fetch one page of ListObjectsV2 XML from the S3 bucket.

        Args:
            prefix: Key prefix to filter objects by (e.g. "cardano-db-sync/").
            delimiter: Set to "/" to group keys into CommonPrefixes (folders).

        Returns:
            The raw XML payload returned by S3.

        Raises:
            requests.HTTPError: If S3 responds with a non-2xx status.
        """
        params = {"list-type": "2", "prefix": prefix, "delimiter": delimiter}

        # NOTE: ListObjectsV2 returns at most 1000 keys per request; no
        # pagination is performed here (snapshot folders stay well below that).
        response = requests.get(
            self.BUCKET_URL, params=params, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.content

    def _parse_s3_xml(self, xml_content: bytes) -> Tuple[List[str], List[SnapshotFile]]:
        """Parse an S3 ListObjectsV2 XML response.

        Args:
            xml_content: Raw XML bytes as returned by ``_get_s3_objects``.

        Returns:
            A tuple ``(folders, files)``: ``folders`` holds the leaf names of
            the CommonPrefixes entries, ``files`` holds the Contents entries
            parsed into :class:`SnapshotFile` objects. Entries missing Key,
            LastModified, or Size are skipped with a warning.
        """
        root = ET.fromstring(xml_content)
        ns_tag = f"{{{S3_NS_URL}}}"

        # 1. Extract folders (CommonPrefixes).
        folders: List[str] = []
        for prefix in root.findall(f".//{ns_tag}CommonPrefixes"):
            prefix_tag = prefix.find(f"{ns_tag}Prefix")  # find() may return None
            if prefix_tag is not None and prefix_tag.text:
                folder_path = prefix_tag.text
                if folder_path.endswith("/"):
                    folders.append(folder_path.strip("/").split("/")[-1])

        # 2. Extract files (Contents).
        files: List[SnapshotFile] = []
        for content in root.findall(f".//{ns_tag}Contents"):
            key_tag = content.find(f"{ns_tag}Key")
            modified_tag = content.find(f"{ns_tag}LastModified")
            size_tag = content.find(f"{ns_tag}Size")

            # Single guard replacing the previously duplicated None/text checks:
            # every tag must exist AND carry non-empty text. The truthiness
            # test also narrows the types for static checkers.
            key_text = key_tag.text if key_tag is not None else None
            modified_text = modified_tag.text if modified_tag is not None else None
            size_text = size_tag.text if size_tag is not None else None
            if not (key_text and modified_text and size_text):
                logger.warning(
                    "Skipping malformed S3 object entry: Missing Key, LastModified, or Size."
                )
                continue  # Skip this entry if critical data is missing

            files.append(
                SnapshotFile(
                    key=key_text,
                    name=key_text.split("/")[-1],
                    last_modified=self._parse_last_modified(modified_text),
                    size=int(size_text),
                )
            )

        return folders, files

    @staticmethod
    def _parse_last_modified(timestamp: str) -> datetime:
        """Parse an S3 LastModified timestamp into an aware UTC datetime.

        S3 normally emits millisecond precision ("...T..:..:...000Z"); a plain
        "...Z" form without fractional seconds is accepted for robustness.

        Raises:
            ValueError: If the timestamp matches neither accepted format.
        """
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
            try:
                return datetime.strptime(timestamp, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
        raise ValueError(f"Unrecognized S3 LastModified timestamp: {timestamp!r}")

    def get_latest_version(self) -> str:
        """Find the numerically latest db-sync version folder (e.g. "13.2").

        Returns:
            The highest "<major>.<minor>" folder name under ROOT_PREFIX.

        Raises:
            RuntimeError: If no version-shaped folders are present.
        """
        xml_content = self._get_s3_objects(prefix=self.ROOT_PREFIX, delimiter="/")
        folders, _ = self._parse_s3_xml(xml_content)

        version_folders = [f for f in folders if re.match(r"^\d+\.\d+$", f)]

        if not version_folders:
            err_msg = "No version folders found in S3 response."
            raise RuntimeError(err_msg)

        # Numeric component sort: "13.10" must rank above "13.2", which a
        # plain lexicographic sort would get wrong.
        return max(version_folders, key=lambda v: [int(part) for part in v.split(".")])

    def get_latest_snapshot(self, version: str) -> SnapshotFile:
        """Find the most recently modified snapshot archive for *version*.

        Args:
            version: Version folder name, e.g. "13.2".

        Returns:
            The :class:`SnapshotFile` with the newest LastModified among files
            whose name ends in ".tgz" and contains "snapshot" (case-insensitive).

        Raises:
            RuntimeError: If no matching snapshot files are found.
        """
        version_prefix = f"{self.ROOT_PREFIX}{version}/"
        xml_content = self._get_s3_objects(prefix=version_prefix)
        _, files = self._parse_s3_xml(xml_content)

        # Filter: Revert to the original working filter (.tgz AND 'snapshot')
        snapshot_files = [
            f for f in files if f.name.endswith(".tgz") and "snapshot" in f.name.lower()
        ]

        if not snapshot_files:
            file_names = [f.name for f in files]
            # Lazy %-args so the message is only formatted when actually emitted.
            logger.warning(
                "Files found in S3 response for %s: %s", version_prefix, file_names
            )
            error_msg = (
                f"No snapshot files found for version {version}. Filtered files: {file_names}"
            )
            raise RuntimeError(error_msg)

        return max(snapshot_files, key=lambda x: x.last_modified)
0 commit comments