Skip to content

Commit 34b7d9f

Browse files
committed
simplify data classes and streamline code
1 parent 07c6248 commit 34b7d9f

File tree

2 files changed

+70
-92
lines changed

2 files changed

+70
-92
lines changed

src/utils/s3_mods.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -453,11 +453,10 @@ def rd_search_file(dir_path: str, ending: str) -> str:
453453

454454
def rd_list_manifest_files(prefix: str) -> dict:
455455
"""
456-
Return manifest filenames and their last modified date with given conditions.
456+
Return manifest filenames and their last modified date within a path set by prefix.
457457
458458
Args:
459459
prefix (str): The prefix path to search for manifest files.
460-
target_date (str): The target date in 'YYYY-MM-DD' format.
461460
462461
Raises:
463462
boto3_client.exceptions.ClientError: If there is an error accessing S3.
Lines changed: 69 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
"""
2-
Functions to obtain information about spp snapshots from metadata files.
2+
Functions to obtain information about spp snapshots from metadata manifest files.
33
44
These scripts are designed to run in the DAP S3 environment only.
55
"""
66

77
import logging
8+
import os
89
from dataclasses import dataclass
910
from datetime import datetime
1011

@@ -13,45 +14,30 @@
1314
MetadataLogger = logging.getLogger(__name__)
1415

1516

16-
# Data classes for manifest file info and SPP metadata
17-
@dataclass
18-
class ManifestFileInfo:
19-
mani_filename: str
20-
mani_last_modified_date: datetime
21-
22-
2317
@dataclass
2418
class SPPMetadata:
2519
# include default values to check against
26-
spp_filename: str = ""
27-
spp_created_date: str = ""
28-
version: int = 1
29-
description: str = "SPP BERD snapshot files"
30-
iterationL1: str = "spp_snapshots"
31-
32-
33-
@dataclass
34-
class SnapshotCandidateFileInfo:
3520
mani_filename: str
3621
mani_last_modified_date: datetime
3722
spp_filename: str
3823
spp_created_date: str
3924
version: int
25+
description: str = "SPP BERD snapshot files"
26+
iterationL1: str = "spp_snapshots"
4027

4128

4229
def filter_manifest_files(
4330
files_dict: dict[str, datetime], target_date: str = "", wanted_str: str = ""
44-
) -> list[ManifestFileInfo]:
45-
"""
46-
Filter manifest files by target date and/ or filename substring.
31+
) -> dict[str, datetime]:
32+
"""Filter manifest files by target date and/or filename substring.
4733
4834
Args:
4935
files_dict (dict): Dictionary of {filename: last_modified_date}.
5036
target_date (str): The target date in 'YYYY-MM-DD' format.
5137
wanted_str (str): Substring that should be present in the filename.
5238
5339
Returns:
54-
dict: Filtered dictionary with files matching the target date or substring.
40+
dict: Filtered dictionary of {filename: last_modified_date}.
5541
"""
5642
if wanted_str:
5743
filtered_files = {
@@ -65,17 +51,28 @@ def filter_manifest_files(
6551
for filename, last_mod_date in files_dict.items()
6652
if last_mod_date.strftime("%Y-%m-%d") == target_date
6753
}
68-
manifest_list = [
69-
ManifestFileInfo(mani_filename=fn, mani_last_modified_date=lm)
70-
for fn, lm in filtered_files.items()
71-
]
72-
return manifest_list
54+
return filtered_files
7355

7456

75-
def get_spp_file_info_from_manifest(filename: str) -> SPPMetadata:
76-
"""Get SPP file information from a manifest file as SPPMetadata."""
77-
manif_file_dict = rd_load_json(filename)
57+
def get_spp_file_info_from_manifest(
58+
mani_filename: str, mani_last_modified_date: datetime
59+
) -> SPPMetadata:
60+
"""Get SPP file information from a manifest file as SPPMetadata.
61+
62+
Required fields are stored in a SPPMetadata data class for clarity.
63+
64+
Args:
65+
mani_filename (str): The manifest filename.
66+
mani_last_modified_date (datetime): The last modified date of the manifest file.
67+
68+
Returns:
69+
SPPMetadata: The SPPMetadata object with information from the manifest.
70+
"""
71+
manif_file_dict = rd_load_json(mani_filename)
72+
7873
metadata = SPPMetadata(
74+
mani_filename=mani_filename,
75+
mani_last_modified_date=mani_last_modified_date,
7976
spp_filename=manif_file_dict["files"][0]["name"],
8077
spp_created_date=str(manif_file_dict["tdzComplete"])[:10],
8178
version=manif_file_dict.get("version", 1),
@@ -88,8 +85,7 @@ def get_spp_file_info_from_manifest(filename: str) -> SPPMetadata:
8885
def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
8986
"""Check the created date in metadata matches the target date and other validation.
9087
91-
As well as checking for the created date, also checks description and iterationL1
92-
fields to ensure data is as expected.
88+
Also check description and iterationL1 fields to ensure data is as expected.
9389
9490
Args:
9591
metadata (SPPMetadata): The SPPMetadata object to check.
@@ -98,15 +94,23 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
9894
Returns:
9995
dict: Dict of mismatched metadata entries, empty if all match.
10096
"""
101-
# create a SPPMetadata object with expected entries. The entries not specified will
102-
# be checked against defaults set.
97+
# where no checks are needed, set expected fields to actual values
98+
exp_version = metadata.version
99+
exp_mani_filename = metadata.mani_filename
100+
exp_mani_last_modified_date = metadata.mani_last_modified_date
101+
# expected spp filename is the manifest filename without path or .mani extension
102+
exp_spp_filename = os.path.basename(exp_mani_filename).replace(".mani", "")
103103
if target_date == "":
104-
target_date = metadata.spp_created_date # no check needed
104+
target_date = metadata.spp_created_date
105105

106+
# Create an expected SPPMetadata object for checks.
106107
expected_metadata = SPPMetadata(
107-
spp_filename=metadata.spp_filename, # no check needed
108-
spp_created_date=target_date, # check created matches target date
109-
version=metadata.version, # no check needed
108+
mani_filename=exp_mani_filename,
109+
mani_last_modified_date=exp_mani_last_modified_date,
110+
spp_filename=exp_spp_filename,
111+
spp_created_date=target_date,
112+
version=exp_version,
113+
# the remaining fields will be checked against their default values
110114
)
111115

112116
error_dict = {
@@ -124,26 +128,24 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
124128
return {}
125129

126130

127-
def check_files(
128-
mani_files_list: list[ManifestFileInfo], target_date: str = ""
129-
) -> list[SnapshotCandidateFileInfo]:
130-
"""Test filtered files for metadata matching the target date and metadata validity.
131+
def check_files(mani_files_dict: dict, target_date: str = "") -> list[SPPMetadata]:
132+
"""Check filtered manifest file dict for metadata validity and (optionally) date.
133+
131134
Args:
132-
files_dict (dict): {filename: last_modified_date}
135+
mani_files_dict (dict): {manifest_filename: last_modified_date}
133136
target_date (str): Target date in 'YYYY-MM-DD' format. If empty, no date check.
134137
135138
Returns:
136-
dict: Dict with candidate file information if metadata matches target date.
139+
list: List with candidate file information if metadata matches target date.
137140
"""
138141
candidate_file_list = []
139-
for mani_file_info in mani_files_list:
140-
spp_metadata = get_spp_file_info_from_manifest(mani_file_info.mani_filename)
142+
for mani_filename, mani_mod_date in mani_files_dict.items():
143+
spp_metadata = get_spp_file_info_from_manifest(mani_filename, mani_mod_date)
141144
error_dict = check_metadata(spp_metadata, target_date)
142145
if not error_dict:
143-
# create a CandidateFileInfo object and add to dict
144-
new_candidate = SnapshotCandidateFileInfo(
145-
mani_filename=mani_file_info.mani_filename,
146-
mani_last_modified_date=mani_file_info.mani_last_modified_date,
146+
new_candidate = SPPMetadata(
147+
mani_filename=mani_filename,
148+
mani_last_modified_date=mani_mod_date,
147149
spp_filename=spp_metadata.spp_filename,
148150
spp_created_date=spp_metadata.spp_created_date,
149151
version=spp_metadata.version,
@@ -152,7 +154,7 @@ def check_files(
152154
return candidate_file_list
153155

154156

155-
def get_most_recent_file(file_list: list[SnapshotCandidateFileInfo]) -> str:
157+
def get_most_recent_file(file_list: list[SPPMetadata]) -> str:
156158
"""Get the filename of the most recently modified file from a list.
157159
158160
Args:
@@ -173,7 +175,7 @@ def get_most_recent_file(file_list: list[SnapshotCandidateFileInfo]) -> str:
173175

174176

175177
def get_lastest_version_file(
176-
file_list: list[SnapshotCandidateFileInfo],
178+
file_list: list[SPPMetadata],
177179
) -> tuple[str, int, str]:
178180
"""Get the highest version file from a list of files with the same filename.
179181
@@ -184,22 +186,22 @@ def get_lastest_version_file(
184186
tuple: The filename of the highest version file and its version number.
185187
"""
186188
if len(file_list) > 1:
187-
# sort the list by version and get the highest version
188-
sorted_files = sorted(file_list, key=lambda x: x.version, reverse=True)
189-
highest_version_file = sorted_files[0].spp_filename
190-
corresponding_version = sorted_files[0].version
191-
corresponding_created_date = sorted_files[0].spp_created_date
192-
else:
193-
highest_version_file = file_list[0].spp_filename
194-
corresponding_version = file_list[0].version
195-
corresponding_created_date = file_list[0].spp_created_date
189+
# sort the list by version and get the highest version unless only one file
190+
file_list = sorted(file_list, key=lambda x: x.version, reverse=True)
191+
highest_version_file = file_list[0].spp_filename
192+
corresponding_version = file_list[0].version
193+
corresponding_created_date = file_list[0].spp_created_date
194+
196195
return highest_version_file, corresponding_version, corresponding_created_date
197196

198197

199-
def get_snapshot_name_from_date(
200-
prefix: str, survey_year: str, spp_date: str
198+
def get_snapshot_name(
199+
prefix: str, survey_year: str, spp_date: str = ""
201200
) -> tuple[str, int, str]:
202-
"""Return the name of an spp snapshot delivered on the given date.
201+
"""Return the name, version and created date of an spp snapshot.
202+
203+
Optionally check first against a target date. The latest version is returned
204+
if multiple files match the criteria.
203205
204206
Args:
205207
survey_year (str): The survey year the snapshot belongs to.
@@ -215,8 +217,10 @@ def get_snapshot_name_from_date(
215217
filtered_mani_file_list = filter_manifest_files(files_dict, spp_date, wanted_str)
216218
# check filtered files for metadata matching the target date and metadata validity
217219
candidate_file_list = check_files(filtered_mani_file_list, spp_date)
218-
# if no candidate files are found checking the last modified date, search all files
219-
if not candidate_file_list:
220+
221+
# if we checked against a target date but no candidate files are found,
222+
# try filtering only by wanted string
223+
if not (spp_date == "") and not candidate_file_list:
220224
files_dict = filter_manifest_files(files_dict, wanted_str=wanted_str)
221225
candidate_file_list = check_files(filtered_mani_file_list, spp_date)
222226
if not candidate_file_list:
@@ -230,28 +234,3 @@ def get_snapshot_name_from_date(
230234
]
231235
snapshot_name, version, created_date = get_lastest_version_file(duplicate_files)
232236
return snapshot_name, version, created_date
233-
234-
235-
def get_latest_snapshot_name(prefix: str, survey_year: str) -> tuple[str, int, str]:
236-
"""Return the name of the latest spp snapshot for a given survey year.
237-
238-
Args:
239-
survey_year (str): The survey year the snapshot belongs to.
240-
241-
Returns:
242-
tuple: The name of the latest spp snapshot, its version, and created date.
243-
"""
244-
# get a dictionary of all manifest files with the given prefix
245-
files_dict = rd_list_manifest_files(prefix)
246-
# filter the manifest files for the wanted string
247-
wanted_str = f"snapshot-{survey_year}12-002-"
248-
filtered_mani_file_list = filter_manifest_files(files_dict, wanted_str=wanted_str)
249-
candidate_file_list = check_files(filtered_mani_file_list, target_date="")
250-
# check filtered files for metadata validity
251-
snapshot_name = get_most_recent_file(candidate_file_list)
252-
# check whether snapshot filename is unique, if not get the highest version
253-
duplicate_files = [
254-
file for file in candidate_file_list if file.spp_filename == snapshot_name
255-
]
256-
snapshot_name, version, created_date = get_lastest_version_file(duplicate_files)
257-
return snapshot_name, version, created_date

0 commit comments

Comments
 (0)