Skip to content

Commit dd09cb2

Browse files
feat(dbsync): Introduce DBSyncSnapshotService and snapshot freshness test
Introduces new components to cleanly implement the Cardano DB-Sync snapshot freshness check using the S3 REST API. * **DBSyncSnapshotService:** A new service class responsible for interacting with the IOHK S3 repository, encapsulating API calls and robust XML parsing logic (including handling S3 namespaces and date formats). * **Clarity:** Uses `dataclass` for snapshot metadata and standard Pytest best practices.
1 parent d2ffdf3 commit dd09cb2

File tree

2 files changed

+211
-0
lines changed

2 files changed

+211
-0
lines changed

cardano_node_tests/tests/test_dbsync.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import logging
44
import time
55
import typing as tp
6+
from datetime import datetime
7+
from datetime import timedelta
8+
from datetime import timezone
69

710
import allure
811
import pytest
@@ -18,6 +21,8 @@
1821
from cardano_node_tests.utils import dbsync_utils
1922
from cardano_node_tests.utils import helpers
2023
from cardano_node_tests.utils import logfiles
24+
from cardano_node_tests.utils.dbsync_snapshot_service import DBSyncSnapshotService
25+
from cardano_node_tests.utils.dbsync_snapshot_service import SnapshotFile
2126
from cardano_node_tests.utils.versions import VERSIONS
2227

2328
LOGGER = logging.getLogger(__name__)
@@ -381,3 +386,49 @@ def test_epoch(self, cluster: clusterlib.ClusterLib):
381386
assert blocks_data_tx_count == epoch_data_tx_count, (
382387
f"Transactions count don't match between tables for epoch {epoch}"
383388
)
389+
390+
391+
class TestDBSyncSnapshot:
    """Tests for db-sync snapshot availability and freshness."""

    # Freshness threshold: the latest snapshot must be at most this many days old.
    MAX_SNAPSHOT_AGE_DAYS = 5

    @pytest.fixture()
    def db_sync_snapshots(self) -> DBSyncSnapshotService:
        """Create the `DBSyncSnapshotService` client used to query the S3 repository."""
        # The constructor is infallible and can never return None, so no
        # availability check is needed here; connectivity problems surface as
        # request errors inside the test itself.
        return DBSyncSnapshotService()

    @allure.link(helpers.get_vcs_link())
    @pytest.mark.smoke
    def test_latest_snapshot_freshness(self, db_sync_snapshots: DBSyncSnapshotService):
        """Check that the latest db-sync snapshot is not older than 5 days.

        This test uses the S3 REST API to query the Cardano mainnet snapshot repository
        and verifies that the most recent snapshot is fresh.
        """
        # 1. Find latest version
        latest_version = db_sync_snapshots.get_latest_version()
        LOGGER.info(f"Latest db-sync version: {latest_version}")

        # 2. Get latest snapshot for that version
        latest_snapshot: SnapshotFile = db_sync_snapshots.get_latest_snapshot(latest_version)

        LOGGER.info(f"Latest snapshot: {latest_snapshot.name}")
        LOGGER.info(f"Snapshot date: {latest_snapshot.last_modified.isoformat()}")
        LOGGER.info(f"Snapshot size: {latest_snapshot.size_gb:.2f} GB")

        # 3. Perform freshness check. Capture "now" once so the reported age and
        # the limit in the failure message are based on the same instant.
        now = datetime.now(timezone.utc)
        freshness_limit = now - timedelta(days=self.MAX_SNAPSHOT_AGE_DAYS)

        assert latest_snapshot.last_modified >= freshness_limit, (
            f"The latest snapshot is too old. "
            f"Age: {(now - latest_snapshot.last_modified).days} days. "
            f"Snapshot date: {latest_snapshot.last_modified.strftime('%Y-%m-%d %H:%M:%S UTC')}, "
            f"Limit: {self.MAX_SNAPSHOT_AGE_DAYS} days ago "
            f"({freshness_limit.strftime('%Y-%m-%d %H:%M:%S UTC')})."
        )

        LOGGER.info("Success: The latest snapshot is recent (within 5-day limit).")
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
import logging
2+
import re
3+
import xml.etree.ElementTree as ET
4+
from dataclasses import dataclass
5+
from dataclasses import field
6+
from datetime import datetime
7+
from datetime import timezone
8+
from typing import List
9+
from typing import Tuple
10+
11+
import requests
12+
13+
logger = logging.getLogger(__name__)
14+
15+
# Define the full S3 namespace URL string outside the class for readability
16+
S3_NS_URL = "http://s3.amazonaws.com/doc/2006-03-01/"
17+
18+
19+
@dataclass
class SnapshotFile:
    """Parsed metadata for a single snapshot object listed in the S3 bucket."""

    # Plain (unannotated) class attribute — intentionally NOT a dataclass field.
    _BYTES_PER_GB = 1024**3

    key: str  # full S3 object key
    name: str  # basename of the key
    last_modified: datetime  # timezone-aware modification timestamp
    size: int  # object size in bytes
    size_gb: float = field(init=False)  # derived from `size`, set in __post_init__

    def __post_init__(self) -> None:
        """Derive the human-readable size (in binary gigabytes) from the byte count."""
        self.size_gb = self.size / self._BYTES_PER_GB
31+
32+
33+
class DBSyncSnapshotService:
    """Service class to interact with the Cardano DB-Sync S3 repository.

    Uses the anonymous S3 REST API (ListObjectsV2) to discover db-sync version
    folders and the snapshot archives published under each of them.
    """

    BUCKET_URL: str = "https://update-cardano-mainnet.iohk.io"
    ROOT_PREFIX: str = "cardano-db-sync/"
    # Timeout (seconds) for S3 requests so a network stall cannot hang a test run.
    REQUEST_TIMEOUT: int = 60

    def _get_s3_objects(self, prefix: str = "", delimiter: str = "") -> bytes:
        """Fetch XML content from the S3 bucket using the ListObjectsV2 REST API.

        Args:
            prefix: Limit the listing to keys starting with this prefix.
            delimiter: Group keys by this delimiter (use "/" to list "folders").

        Returns:
            Raw XML payload of the listing response.

        Raises:
            requests.HTTPError: If the bucket responds with an error status.
        """
        params = {"list-type": "2", "prefix": prefix, "delimiter": delimiter}

        response = requests.get(self.BUCKET_URL, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.content

    @staticmethod
    def _parse_s3_timestamp(value: str) -> datetime:
        """Parse an S3 `LastModified` value into a timezone-aware UTC datetime.

        S3 timestamps normally carry fractional seconds ("...T12:00:00.000Z"),
        but the fallback format tolerates responses without them.

        Raises:
            ValueError: If the value matches neither known timestamp format.
        """
        for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
            try:
                return datetime.strptime(value, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
        err_msg = f"Unrecognized S3 timestamp format: {value!r}"
        raise ValueError(err_msg)

    def _parse_s3_xml(self, xml_content: bytes) -> Tuple[List[str], List[SnapshotFile]]:
        """Parse an S3 ListObjectsV2 XML response.

        Returns:
            A tuple of (folder names from `CommonPrefixes`, files from `Contents`).
            Malformed `Contents` entries are logged and skipped.
        """
        root = ET.fromstring(xml_content)
        ns_tag = f"{{{S3_NS_URL}}}"

        # 1. Extract folders (CommonPrefixes)
        folders: List[str] = []
        for common_prefix in root.findall(f".//{ns_tag}CommonPrefixes"):
            prefix_tag = common_prefix.find(f"{ns_tag}Prefix")
            # Safety check: find() can return None and text can be empty.
            if prefix_tag is None or not prefix_tag.text:
                continue
            folder_path = prefix_tag.text
            if folder_path.endswith("/"):
                folders.append(folder_path.strip("/").split("/")[-1])

        # 2. Extract files (Contents)
        files: List[SnapshotFile] = []
        for content in root.findall(f".//{ns_tag}Contents"):
            key_tag = content.find(f"{ns_tag}Key")
            modified_tag = content.find(f"{ns_tag}LastModified")
            size_tag = content.find(f"{ns_tag}Size")

            # Single check that each critical tag exists AND has text content.
            if (
                key_tag is None
                or not key_tag.text
                or modified_tag is None
                or not modified_tag.text
                or size_tag is None
                or not size_tag.text
            ):
                logger.warning(
                    "Skipping malformed S3 object entry: Missing Key, LastModified, or Size."
                )
                continue

            key = key_tag.text
            files.append(
                SnapshotFile(
                    key=key,
                    name=key.split("/")[-1],
                    last_modified=self._parse_s3_timestamp(modified_tag.text),
                    size=int(size_tag.text),
                )
            )

        return folders, files

    def get_latest_version(self) -> str:
        """Find the numerically latest db-sync version folder (e.g. "13.6").

        Raises:
            RuntimeError: If no version-shaped folders are present in the listing.
        """
        xml_content = self._get_s3_objects(prefix=self.ROOT_PREFIX, delimiter="/")
        folders, _ = self._parse_s3_xml(xml_content)

        # Only keep folders that look like "major.minor" version numbers.
        version_folders = [f for f in folders if re.match(r"^\d+\.\d+$", f)]

        if not version_folders:
            err_msg = "No version folders found in S3 response."
            raise RuntimeError(err_msg)

        # Compare numerically, not lexically, so e.g. "13.10" > "13.2".
        return max(version_folders, key=lambda v: [int(part) for part in v.split(".")])

    def get_latest_snapshot(self, version: str) -> SnapshotFile:
        """Find the most recently modified snapshot archive for a given version.

        Raises:
            RuntimeError: If no matching snapshot files are found.
        """
        version_prefix = f"{self.ROOT_PREFIX}{version}/"
        xml_content = self._get_s3_objects(prefix=version_prefix)
        _, files = self._parse_s3_xml(xml_content)

        # Snapshot archives are ".tgz" files with "snapshot" in their name.
        snapshot_files = [
            f for f in files if f.name.endswith(".tgz") and "snapshot" in f.name.lower()
        ]

        if not snapshot_files:
            file_names = [f.name for f in files]
            logger.warning(f"Files found in S3 response for {version_prefix}: {file_names}")
            error_msg = (
                f"No snapshot files found for version {version}. Filtered files: {file_names}"
            )
            raise RuntimeError(error_msg)

        return max(snapshot_files, key=lambda x: x.last_modified)

0 commit comments

Comments
 (0)