Skip to content

Commit db1621d

Browse files
committed
SAIL_316_snapshot_date find snapshot from given date
1 parent 259b5bf commit db1621d

File tree

2 files changed

+264
-0
lines changed

2 files changed

+264
-0
lines changed

src/utils/s3_mods.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,3 +449,32 @@ def rd_search_file(dir_path: str, ending: str) -> str:
449449
if file.endswith(ending):
450450
target_file = str(file)
451451
return target_file
452+
453+
454+
def rd_list_manifest_files(prefix: str) -> dict:
    """
    Return manifest filenames and their last modified dates under a prefix.

    Every object listed under ``prefix`` in the bucket ``s3_bucket`` whose key
    ends with ".mani" is included. All pages returned by the paginator are
    read.

    Args:
        prefix (str): The prefix path to search for manifest files.

    Raises:
        s3_client.exceptions.ClientError: If there is an error accessing S3.
            The error is logged before being re-raised.

    Returns:
        dict: A dictionary with manifest file keys and their last modified
            dates.
    """
    try:
        manifest_files = {}
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
            # Tolerate pages that carry no "Contents" entry.
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if key.endswith(".mani"):
                    # "LastModified" is a datetime object
                    manifest_files[key] = obj["LastModified"]
        return manifest_files
    except s3_client.exceptions.ClientError as e:
        s3_logger.error(f"Error listing manifest files with prefix {prefix}: {e}")
        # Bare raise re-raises the active exception unchanged.
        raise
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
"""
2+
Functions to obtain information about spp snapshots from metadata files.
3+
4+
These scripts are designed to run in the DAP S3 environment only.
5+
"""
6+
7+
import logging
8+
from dataclasses import dataclass
9+
from datetime import datetime
10+
11+
from src.utils.s3_mods import rd_load_json, rd_list_manifest_files
12+
13+
MetadataLogger = logging.getLogger(__name__)
14+
15+
16+
# Data classes for manifest file info and SPP metadata
17+
@dataclass
class ManifestFileInfo:
    """Name and last-modified timestamp of a single manifest file."""

    # Name (key) of the manifest file.
    mani_filename: str
    # When the manifest file was last modified.
    mani_last_modified_date: datetime
21+
22+
23+
@dataclass
class SPPMetadata:
    """Details about an SPP snapshot file, as read from a manifest file."""

    # Name of the SPP file the manifest refers to.
    spp_filename: str
    # Creation date of the SPP file, held as a string.
    spp_created_date: str
    # Version number of the snapshot.
    version: int
    # Free-text description from the manifest; empty when not supplied.
    description: str = ""
    # Value of the manifest's "iterationL1" entry; empty when not supplied.
    iterationL1: str = ""
30+
31+
32+
@dataclass
class SnapshotCandidateFileInfo:
    """A manifest file paired with the SPP snapshot details read from it."""

    # Name (key) of the manifest file.
    mani_filename: str
    # When the manifest file was last modified.
    mani_last_modified_date: datetime
    # Name of the SPP file the manifest refers to.
    spp_filename: str
    # Creation date of the SPP file, held as a string.
    spp_created_date: str
    # Version number of the snapshot.
    version: int
39+
40+
41+
def filter_manifest_files(
    files_dict: dict[str, datetime], target_date: str = "", wanted_str: str = ""
) -> dict:
    """
    Filter manifest files by target date and/or filename substring.

    Each filter is applied only when its argument is non-empty. When both are
    given, a file must satisfy both to be kept. When neither is given, every
    file is kept.

    Args:
        files_dict (dict): Dictionary of {filename: last_modified_date}.
        target_date (str): The target date in 'YYYY-MM-DD' format.
        wanted_str (str): Substring that should be present in the filename.

    Returns:
        dict: A new dictionary holding the files that pass every requested
            filter. The input dictionary is not modified.
    """
    return {
        filename: last_mod_date
        for filename, last_mod_date in files_dict.items()
        # An empty filter argument means "do not filter on this criterion".
        if (not wanted_str or wanted_str in filename)
        and (not target_date or last_mod_date.strftime("%Y-%m-%d") == target_date)
    }
68+
69+
70+
def get_spp_file_info_from_manifest(filename: str) -> SPPMetadata:
    """Load a manifest file and summarise it as an SPPMetadata record.

    Args:
        filename (str): Name of the manifest file, passed to rd_load_json.

    Returns:
        SPPMetadata: The name of the first entry under "files", the first ten
            characters of "tdzComplete" as the created date, plus "version"
            (default 1), "description" and "iterationL1" (default "").
    """
    manifest = rd_load_json(filename)
    first_file_name = manifest["files"][0]["name"]
    # Keep only the leading 10 characters of tdzComplete
    # (presumably the YYYY-MM-DD part -- confirm against a real manifest).
    created_date = str(manifest["tdzComplete"])[:10]
    return SPPMetadata(
        spp_filename=first_file_name,
        spp_created_date=created_date,
        version=manifest.get("version", 1),
        description=manifest.get("description", ""),
        iterationL1=manifest.get("iterationL1", ""),
    )
81+
82+
83+
def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
    """Check that metadata describes an SPP snapshot created on the target date.

    Three attributes of ``metadata`` are compared with their expected values:
    ``spp_created_date`` (must equal ``target_date``), ``description`` and
    ``iterationL1`` (must equal fixed strings).

    Args:
        metadata (SPPMetadata): Metadata read from a manifest file.
        target_date (str): The expected created date, in the same string
            format as ``metadata.spp_created_date``.

    Returns:
        dict: {attribute name: expected value} for every attribute that does
            not match. Empty when all checks pass. A non-empty result is also
            logged as an error.
    """
    # Keys must be real SPPMetadata attribute names: a missing attribute reads
    # as None through getattr and would always be reported as a mismatch.
    expected_entries = {
        "spp_created_date": target_date,
        "description": "SPP BERD snapshot files",
        "iterationL1": "spp_snapshots",
    }
    error_dict = {
        attr: expected
        for attr, expected in expected_entries.items()
        if getattr(metadata, attr, None) != expected
    }
    if error_dict:
        msg = (
            f"Metadata check failed for date {target_date}. "
            f"Mismatched entries: {error_dict}"
        )
        MetadataLogger.error(msg)
    return error_dict
101+
102+
103+
def check_manifest_files_for_date(
    files_dict: dict, target_date: str, wanted_str: str
) -> dict:
    """Check manifest files for a specific target date.

    Manifests are only inspected when at least one file in ``files_dict`` was
    last modified on ``target_date``. In that case every file whose name
    contains ``wanted_str`` is loaded, and it is kept when its "tdzComplete"
    value starts with ``target_date``.

    Args:
        files_dict (dict): {filename: last_modified_date}
        target_date (str): The target date in 'YYYY-MM-DD' format.
        wanted_str (str): Substring that should be present in the filename.

    Returns:
        dict: {filename: label} for the matching manifest files. The label is
            ``target_date``, or ``f"{target_date}_v{version}"`` when the
            manifest's "version" is greater than 1. Empty when nothing matches.
    """
    # Gate: do nothing unless some file was last modified on the target date.
    if not any(
        last_mod_date.strftime("%Y-%m-%d") == target_date
        for last_mod_date in files_dict.values()
    ):
        return {}

    correct_date_files = {}
    for filename in files_dict:
        # only consider the manifest files with the correct substring
        if wanted_str not in filename:
            continue
        # read each manifest file and check the tdzComplete date
        manif_file_dict = rd_load_json(filename)
        created_date = str(manif_file_dict["tdzComplete"])
        if not created_date.startswith(target_date):
            continue
        # A missing version is treated as version 1, so the comparison below
        # never mixes str and int.
        file_version = manif_file_dict.get("version", 1)
        if file_version > 1:
            correct_date_files[filename] = f"{target_date}_v{file_version}"
        else:
            correct_date_files[filename] = target_date
    return correct_date_files
129+
130+
131+
def test_filtered_files(
    files_dict: dict, target_date: str
) -> list[SnapshotCandidateFileInfo]:
    """Collect the manifest files whose metadata passes the target-date checks.

    Each manifest in ``files_dict`` is read with
    get_spp_file_info_from_manifest and validated with check_metadata.
    Manifests with no mismatches become candidates.

    NOTE(review): because of the ``test_`` prefix, pytest will try to collect
    this function if it is imported into a test module.

    Args:
        files_dict (dict): {filename: last_modified_date}
        target_date (str): The target date in 'YYYY-MM-DD' format.

    Returns:
        list: One SnapshotCandidateFileInfo per manifest that passed the
            checks, in the iteration order of ``files_dict``.
    """
    candidates = []
    for mani_name, modified in files_dict.items():
        spp_info = get_spp_file_info_from_manifest(mani_name)
        if check_metadata(spp_info, target_date):
            # A non-empty result lists mismatches, so skip this manifest.
            continue
        candidates.append(
            SnapshotCandidateFileInfo(
                mani_filename=mani_name,
                mani_last_modified_date=modified,
                spp_filename=spp_info.spp_filename,
                spp_created_date=spp_info.spp_created_date,
                version=spp_info.version,
            )
        )
    return candidates
157+
158+
159+
def get_most_recent_file(file_list: list[SnapshotCandidateFileInfo]) -> str:
    """Get the SPP filename of the most recently modified manifest in a list.

    Args:
        file_list (list): List of SnapshotCandidateFileInfo objects.

    Raises:
        ValueError: If ``file_list`` is empty.

    Returns:
        str: The ``spp_filename`` of the entry with the latest
            ``mani_last_modified_date``. When several entries share the latest
            date, the first of them in ``file_list`` is used.
    """
    if not file_list:
        raise ValueError("Cannot select the most recent file from an empty list")
    # max() returns the first maximal element, matching a stable
    # descending sort followed by taking element 0.
    newest = max(file_list, key=lambda x: x.mani_last_modified_date)
    return newest.spp_filename
177+
178+
179+
def get_lastest_version_file(
    file_list: list[SnapshotCandidateFileInfo],
) -> tuple[str, int]:
    """Get the filename and version of the highest-version file in a list.

    Intended for lists whose entries share the same SPP filename.

    Args:
        file_list (list): List of SnapshotCandidateFileInfo objects.

    Raises:
        ValueError: If ``file_list`` is empty.

    Returns:
        tuple: The ``spp_filename`` of the highest version file and its
            version number. When several entries share the highest version,
            the first of them in ``file_list`` is used.
    """
    if not file_list:
        raise ValueError("Cannot select the latest version from an empty list")
    # max() returns the first maximal element, matching a stable
    # descending sort followed by taking element 0.
    highest = max(file_list, key=lambda x: x.version)
    return highest.spp_filename, highest.version
199+
200+
201+
def get_snapshot_name_from_date(
    prefix: str, survey_year: str, spp_date: str
) -> tuple[str, int]:
    """Return the name and version of an spp snapshot delivered on the given date.

    Manifest files last modified on ``spp_date`` are checked first. If none of
    them passes the metadata checks, every manifest matching the snapshot name
    pattern is checked instead.

    Args:
        prefix (str): The prefix under which manifest files are listed.
        survey_year (str): The survey year the snapshot belongs to.
        spp_date (str): The date the snapshot was last modified (YYYY-MM-DD).

    Raises:
        ValueError: If no manifest passes the metadata checks for ``spp_date``.

    Returns:
        tuple: The name of the required spp snapshot and its version number.
    """
    # get a dictionary of all manifest files with the given prefix
    all_files = rd_list_manifest_files(prefix)
    # keep the manifests whose name contains the wanted string
    wanted_str = f"snapshot-{survey_year}12-002-"
    named_files = filter_manifest_files(all_files, wanted_str=wanted_str)
    # The filters are applied in two separate calls so that both are
    # guaranteed to take effect, and so that named_files stays available for
    # the fallback search below.
    dated_files = filter_manifest_files(named_files, target_date=spp_date)
    # check filtered files for metadata matching the target date and metadata validity
    candidates = test_filtered_files(dated_files, spp_date)
    # if no candidate files are found checking the last modified date, search all files
    if not candidates:
        candidates = test_filtered_files(named_files, spp_date)
    if not candidates:
        MetadataLogger.error(f"No valid SPP snapshot found for date {spp_date}")
        raise ValueError(f"No valid SPP snapshot found for date {spp_date}")
    # find the most recent file if multiple candidates are found
    snapshot_name = get_most_recent_file(candidates)
    # The snapshot filename may not be unique. Picking the highest version
    # among its entries also yields the version when there is only one entry.
    duplicate_files = [
        file for file in candidates if file.spp_filename == snapshot_name
    ]
    return get_lastest_version_file(duplicate_files)

0 commit comments

Comments
 (0)