Skip to content

Commit 28ffb6a

Browse files
committed
correct key for spp create date
1 parent 34b7d9f commit 28ffb6a

File tree

3 files changed

+84
-16
lines changed

3 files changed

+84
-16
lines changed

src/utils/s3_mods.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,8 @@ def rd_list_manifest_files(prefix: str) -> dict:
462462
boto3_client.exceptions.ClientError: If there is an error accessing S3.
463463
464464
Returns:
465-
dict: A dictionary with manifest file keys and their last modified dates.
465+
dict: A dictionary with manifest file keys and their last modified dates, sorted
466+
by last modified date in descending order.
466467
"""
467468
try:
468469
manifest_files = {}
@@ -473,7 +474,13 @@ def rd_list_manifest_files(prefix: str) -> dict:
473474
last_modified = obj["LastModified"] # This is a datetime object
474475
if key.endswith(".mani"):
475476
manifest_files[key] = last_modified
476-
return manifest_files
477+
478+
# Use lambda to sort dictionary items by their values (last_modified_date)
479+
sorted_manifest_files = dict(
480+
sorted(manifest_files.items(), key=lambda item: item[1], reverse=True)
481+
)
482+
return sorted_manifest_files
483+
477484
except s3_client.exceptions.ClientError as e:
478485
s3_logger.error(f"Error listing manifest files with prefix {prefix}: {e}")
479486
raise e
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Main Goal:
2+
- Find the correct SPP snapshot file from S3 manifest metadata, based on date, version,
3+
and filename pattern.
4+
5+
1. Get all manifest files from S3 with a given prefix (filepath).
6+
- Use S3 client to list all files ending with ".mani" under the prefix.
7+
- Store as {filename: last_modified_date}, sorted by date descending.
8+
9+
2. Filter manifest files by substring
10+
- Use substring search to obtain files relating to the correct survey and survey year
11+
12+
3. If required, get the most recent modified date and check files modified this date.
13+
- Read metadata for all files modified this date
14+
- Check whether the spp created date is the same for each file
15+
- Check whether other metadata is valid for each file
16+
- Create a list of candidate files with their metadata
17+
18+
4. If a specified date was given, check all files modified this date.
19+
- Read metadata for all files modified this date
20+
- Check whether the spp created date is the same date for each file
21+
- Check whether other metadata is valid for each file
22+
- Create a list of candidate files with their metadata
23+
24+
5. If the created date for a file is earlier than its modified date (it can't be later)
25+
- Save the created date in a "most recent created" variable
26+
- Continue the loop through files to the next most recent modified file and onwards
27+
- If a created date is found that is more recent than that in the "most recent created" variable, and the modified date is the same, that is the candidate.
28+
- When the next most recent modified file is modified after the "most recent
29+
created" date, the "most recent created" date is the candidate.
30+
- All files created on a "most recent created" date are candidates.
31+
32+
6. From candidate files:
33+
- If multiple, select the most recent by last_modified_date.
34+
- If multiple with same filename, select the one with the highest version.
35+
36+
7. Return the chosen snapshot filename, version, and created date.
37+
- Conditionally return file info for the most recent date
38+
- Conditionally return file info for specified date
39+
40+
Helper Functions:
41+
- filter_manifest_files(files_dict, target_date, wanted_str)
42+
- get_spp_file_info_from_manifest(mani_filename, mani_last_modified_date)
43+
- check_metadata(metadata, target_date)
44+
- check_files(mani_files_dict, target_date)
45+
- get_most_recent_file(file_list)
46+
- get_lastest_version_file(file_list)

src/utils/snapshot_metadata_utils.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class SPPMetadata:
1919
# include default values to check against
2020
mani_filename: str
21-
mani_last_modified_date: datetime
21+
mani_last_modified_date: str
2222
spp_filename: str
2323
spp_created_date: str
2424
version: int
@@ -28,7 +28,7 @@ class SPPMetadata:
2828

2929
def filter_manifest_files(
3030
files_dict: dict[str, datetime], target_date: str = "", wanted_str: str = ""
31-
) -> dict[str, datetime]:
31+
) -> dict[str, str]:
3232
"""Filter manifest files by target date and/ or filename substring.
3333
3434
Args:
@@ -41,29 +41,29 @@ def filter_manifest_files(
4141
"""
4242
if wanted_str:
4343
filtered_files = {
44-
filename: last_mod_date
44+
filename: last_mod_date.strftime("%Y-%m-%d")
4545
for filename, last_mod_date in files_dict.items()
4646
if wanted_str in filename
4747
}
4848
if target_date:
4949
filtered_files = {
50-
filename: last_mod_date
50+
filename: last_mod_date.strftime("%Y-%m-%d")
5151
for filename, last_mod_date in files_dict.items()
5252
if last_mod_date.strftime("%Y-%m-%d") == target_date
5353
}
5454
return filtered_files
5555

5656

5757
def get_spp_file_info_from_manifest(
58-
mani_filename: str, mani_last_modified_date: datetime
58+
mani_filename: str, mani_last_modified_date: str
5959
) -> SPPMetadata:
6060
"""Get SPP file information from a manifest file as SPPMetadata.
6161
6262
Required fields are stored in a SPPMetadata data class for clarity.
6363
6464
Args:
6565
mani_filename (str): The manifest filename.
66-
mani_last_modified_date (datetime): The last modified date of the manifest file.
66+
mani_last_modified_date (str): The last modified date of the manifest file.
6767
6868
Returns:
6969
SPPMetadata: The SPPMetadata object with information from the manifest.
@@ -74,7 +74,7 @@ def get_spp_file_info_from_manifest(
7474
mani_filename=mani_filename,
7575
mani_last_modified_date=mani_last_modified_date,
7676
spp_filename=manif_file_dict["files"][0]["name"],
77-
spp_created_date=str(manif_file_dict["tdzComplete"])[:10],
77+
spp_created_date=str(manif_file_dict["files"][0]["scanFileUploadTime"])[:10],
7878
version=manif_file_dict.get("version", 1),
7979
description=manif_file_dict.get("description", ""),
8080
iterationL1=manif_file_dict.get("iterationL1", ""),
@@ -98,10 +98,12 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
9898
exp_version = metadata.version
9999
exp_mani_filename = metadata.mani_filename
100100
exp_mani_last_modified_date = metadata.mani_last_modified_date
101-
# remove the path and .mani extension from the filename checking
101+
# remove the path and .mani extension from the manifest filename for the spp name
102102
exp_spp_filename = os.path.basename(exp_mani_filename).replace(".mani", "")
103+
# if no target date is given, check whether the created date is the same as
104+
# the manifest last modified date
103105
if target_date == "":
104-
target_date = metadata.spp_created_date
106+
target_date = metadata.mani_last_modified_date
105107

106108
# Create an expected SPPMetadata object for checks.
107109
expected_metadata = SPPMetadata(
@@ -114,13 +116,13 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
114116
)
115117

116118
error_dict = {
117-
k: v
119+
k: {"expected": v, "found": getattr(metadata, k, None)}
118120
for k, v in expected_metadata.__dict__.items()
119121
if getattr(metadata, k, None) != v
120122
}
121123
if error_dict:
122124
msg = (
123-
f"Metadata check failed for date {target_date}. "
125+
f"Metadata check failed for target date {target_date}. "
124126
f"Mismatched entries: {error_dict}"
125127
)
126128
MetadataLogger.error(msg)
@@ -131,6 +133,14 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
131133
def check_files(mani_files_dict: dict, target_date: str = "") -> list[SPPMetadata]:
132134
"""Check filtered manifest file dict for metadata validity and (optionally) date.
133135
136+
If a target date is provided, the files dict only contains files from that date.
137+
138+
If no target date is provided, then the function will check the newest files first
139+
until a valid file is found (the dictionary has been sorted by last modified date).
140+
141+
However, if the `spp_created_date` is earlier than the modified date, the function
142+
will continue checking further files to find the latest creation date.
143+
134144
Args:
135145
mani_files_dict (dict): {manifest_filename: last_modified_date}
136146
target_date (str): Target date in 'YYYY-MM-DD' format. If empty, no date check.
@@ -142,6 +152,7 @@ def check_files(mani_files_dict: dict, target_date: str = "") -> list[SPPMetadat
142152
for mani_filename, mani_mod_date in mani_files_dict.items():
143153
spp_metadata = get_spp_file_info_from_manifest(mani_filename, mani_mod_date)
144154
error_dict = check_metadata(spp_metadata, target_date)
155+
# TODO: if the only error is created date < modified date, continue checking-
145156
if not error_dict:
146157
new_candidate = SPPMetadata(
147158
mani_filename=mani_filename,
@@ -151,6 +162,7 @@ def check_files(mani_files_dict: dict, target_date: str = "") -> list[SPPMetadat
151162
version=spp_metadata.version,
152163
)
153164
candidate_file_list.append(new_candidate)
165+
154166
return candidate_file_list
155167

156168

@@ -200,8 +212,11 @@ def get_snapshot_name(
200212
) -> tuple[str, int, str]:
201213
"""Return the name, version and created date of an spp snapshot.
202214
203-
Optionally check first against a target date. The latest version is returned
204-
if multiple files match the criteria.
215+
s3_client code exists to list manifest filenames with their last modified dates.
216+
However, it is necessary to read the manifest files to get the snapshot created date
217+
and version number.
218+
219+
This code will start by filtering for the required survey year and, if given, date.
205220
206221
Args:
207222
survey_year (str): The survey year the snapshot belongs to.
@@ -212,7 +227,7 @@ def get_snapshot_name(
212227
"""
213228
# get a dictionary of all manifest files with the given prefix
214229
files_dict = rd_list_manifest_files(prefix)
215-
# filter the manifest files for the given date and wanted string
230+
# filter the manifest files for the given date (if given) and wanted string
216231
wanted_str = f"snapshot-{survey_year}12-002-"
217232
filtered_mani_file_list = filter_manifest_files(files_dict, spp_date, wanted_str)
218233
# check filtered files for metadata matching the target date and metadata validity

0 commit comments

Comments
 (0)