11"""
2- Functions to obtain information about spp snapshots from metadata files.
2+ Functions to obtain information about spp snapshots from metadata manifest files.
33
44These scripts are designed to run in DAP S3 environment only.
55"""
66
77import logging
8+ import os
89from dataclasses import dataclass
910from datetime import datetime
1011
1314MetadataLogger = logging .getLogger (__name__ )
1415
1516
16- # Data classes for manifest file info and SPP metadata
17- @dataclass
18- class ManifestFileInfo :
19- mani_filename : str
20- mani_last_modified_date : datetime
21-
22-
2317@dataclass
2418class SPPMetadata :
2519 # include default values to check against
26- spp_filename : str = ""
27- spp_created_date : str = ""
28- version : int = 1
29- description : str = "SPP BERD snapshot files"
30- iterationL1 : str = "spp_snapshots"
31-
32-
33- @dataclass
34- class SnapshotCandidateFileInfo :
3520 mani_filename : str
3621 mani_last_modified_date : datetime
3722 spp_filename : str
3823 spp_created_date : str
3924 version : int
25+ description : str = "SPP BERD snapshot files"
26+ iterationL1 : str = "spp_snapshots"
4027
4128
4229def filter_manifest_files (
4330 files_dict : dict [str , datetime ], target_date : str = "" , wanted_str : str = ""
44- ) -> list [ManifestFileInfo ]:
45- """
46- Filter manifest files by target date and/ or filename substring.
31+ ) -> dict [str , datetime ]:
32+ """Filter manifest files by target date and/or filename substring.
4733
4834 Args:
4935 files_dict (dict): Dictionary of {filename: last_modified_date}.
5036 target_date (str): The target date in 'YYYY-MM-DD' format.
5137 wanted_str (str): Substring that should be present in the filename.
5238
5339 Returns:
54- dict: Filtered dictionary with files matching the target date or substring .
40+ dict: Filtered dictionary of {filename: last_modified_date} .
5541 """
5642 if wanted_str :
5743 filtered_files = {
@@ -65,17 +51,28 @@ def filter_manifest_files(
6551 for filename , last_mod_date in files_dict .items ()
6652 if last_mod_date .strftime ("%Y-%m-%d" ) == target_date
6753 }
68- manifest_list = [
69- ManifestFileInfo (mani_filename = fn , mani_last_modified_date = lm )
70- for fn , lm in filtered_files .items ()
71- ]
72- return manifest_list
54+ return filtered_files
7355
7456
75- def get_spp_file_info_from_manifest (filename : str ) -> SPPMetadata :
76- """Get SPP file information from a manifest file as SPPMetadata."""
77- manif_file_dict = rd_load_json (filename )
57+ def get_spp_file_info_from_manifest (
58+ mani_filename : str , mani_last_modified_date : datetime
59+ ) -> SPPMetadata :
60+ """Get SPP file information from a manifest file as SPPMetadata.
61+
62+ Required fields are stored in a SPPMetadata data class for clarity.
63+
64+ Args:
65+ mani_filename (str): The manifest filename.
66+ mani_last_modified_date (datetime): The last modified date of the manifest file.
67+
68+ Returns:
69+ SPPMetadata: The SPPMetadata object with information from the manifest.
70+ """
71+ manif_file_dict = rd_load_json (mani_filename )
72+
7873 metadata = SPPMetadata (
74+ mani_filename = mani_filename ,
75+ mani_last_modified_date = mani_last_modified_date ,
7976 spp_filename = manif_file_dict ["files" ][0 ]["name" ],
8077 spp_created_date = str (manif_file_dict ["tdzComplete" ])[:10 ],
8178 version = manif_file_dict .get ("version" , 1 ),
@@ -88,8 +85,7 @@ def get_spp_file_info_from_manifest(filename: str) -> SPPMetadata:
8885def check_metadata (metadata : SPPMetadata , target_date : str ) -> dict :
8986 """Check the created date in metadata matches the target date and other validation.
9087
91- As well as checking for the created date, also checks description and iterationL1
92- fields to ensure data is as expected.
88+ Also check description and iterationL1 fields to ensure data is as expected.
9389
9490 Args:
9591 metadata (SPPMetadata): The SPPMetadata object to check.
@@ -98,15 +94,23 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
9894 Returns:
9995 dict: Dict of mismatched metadata entries, empty if all match.
10096 """
101- # create a SPPMetadata object with expected entries. The entries not specified will
102- # be checked against defaults set.
97+ # where no checks are needed, set expected fields to actual values
98+ exp_version = metadata .version
99+ exp_mani_filename = metadata .mani_filename
100+ exp_mani_last_modified_date = metadata .mani_last_modified_date
101+ # remove the path and .mani extension to derive the expected spp filename
102+ exp_spp_filename = os .path .basename (exp_mani_filename ).replace (".mani" , "" )
103103 if target_date == "" :
104- target_date = metadata .spp_created_date # no check needed
104+ target_date = metadata .spp_created_date
105105
106+ # Create an expected SPPMetadata object for checks.
106107 expected_metadata = SPPMetadata (
107- spp_filename = metadata .spp_filename , # no check needed
108- spp_created_date = target_date , # check created matches target date
109- version = metadata .version , # no check needed
108+ mani_filename = exp_mani_filename ,
109+ mani_last_modified_date = exp_mani_last_modified_date ,
110+ spp_filename = exp_spp_filename ,
111+ spp_created_date = target_date ,
112+ version = exp_version ,
113+ # the remaining fields will be checked against their default values
110114 )
111115
112116 error_dict = {
@@ -124,26 +128,24 @@ def check_metadata(metadata: SPPMetadata, target_date: str) -> dict:
124128 return {}
125129
126130
127- def check_files (
128- mani_files_list : list [ManifestFileInfo ], target_date : str = ""
129- ) -> list [SnapshotCandidateFileInfo ]:
130- """Test filtered files for metadata matching the target date and metadata validity.
131+ def check_files (mani_files_dict : dict , target_date : str = "" ) -> list [SPPMetadata ]:
132+ """Check filtered manifest file dict for metadata validity and (optionally) date.
133+
131134 Args:
132- files_dict (dict): {filename : last_modified_date}
135+ mani_files_dict (dict): {manifest_filename : last_modified_date}
133136 target_date (str): Target date in 'YYYY-MM-DD' format. If empty, no date check.
134137
135138 Returns:
136- dict: Dict with candidate file information if metadata matches target date.
139+ list: List with candidate file information if metadata matches target date.
137140 """
138141 candidate_file_list = []
139- for mani_file_info in mani_files_list :
140- spp_metadata = get_spp_file_info_from_manifest (mani_file_info . mani_filename )
142+ for mani_filename , mani_mod_date in mani_files_dict . items () :
143+ spp_metadata = get_spp_file_info_from_manifest (mani_filename , mani_mod_date )
141144 error_dict = check_metadata (spp_metadata , target_date )
142145 if not error_dict :
143- # create a CandidateFileInfo object and add to dict
144- new_candidate = SnapshotCandidateFileInfo (
145- mani_filename = mani_file_info .mani_filename ,
146- mani_last_modified_date = mani_file_info .mani_last_modified_date ,
146+ new_candidate = SPPMetadata (
147+ mani_filename = mani_filename ,
148+ mani_last_modified_date = mani_mod_date ,
147149 spp_filename = spp_metadata .spp_filename ,
148150 spp_created_date = spp_metadata .spp_created_date ,
149151 version = spp_metadata .version ,
@@ -152,7 +154,7 @@ def check_files(
152154 return candidate_file_list
153155
154156
155- def get_most_recent_file (file_list : list [SnapshotCandidateFileInfo ]) -> str :
157+ def get_most_recent_file (file_list : list [SPPMetadata ]) -> str :
156158 """Get the filename of the most recently modified file from a list.
157159
158160 Args:
@@ -173,7 +175,7 @@ def get_most_recent_file(file_list: list[SnapshotCandidateFileInfo]) -> str:
173175
174176
175177def get_lastest_version_file (
176- file_list : list [SnapshotCandidateFileInfo ],
178+ file_list : list [SPPMetadata ],
177179) -> tuple [str , int , str ]:
178180 """Get filename of the highest version file from a list with identical filenames.
179181
@@ -184,22 +186,22 @@ def get_lastest_version_file(
184186 tuple: The filename of the highest version file, its version and created date.
185187 """
186188 if len (file_list ) > 1 :
187- # sort the list by version and get the highest version
188- sorted_files = sorted (file_list , key = lambda x : x .version , reverse = True )
189- highest_version_file = sorted_files [0 ].spp_filename
190- corresponding_version = sorted_files [0 ].version
191- corresponding_created_date = sorted_files [0 ].spp_created_date
192- else :
193- highest_version_file = file_list [0 ].spp_filename
194- corresponding_version = file_list [0 ].version
195- corresponding_created_date = file_list [0 ].spp_created_date
189+ # sort the list by version and get the highest version unless only one file
190+ file_list = sorted (file_list , key = lambda x : x .version , reverse = True )
191+ highest_version_file = file_list [0 ].spp_filename
192+ corresponding_version = file_list [0 ].version
193+ corresponding_created_date = file_list [0 ].spp_created_date
194+
196195 return highest_version_file , corresponding_version , corresponding_created_date
197196
198197
199- def get_snapshot_name_from_date (
200- prefix : str , survey_year : str , spp_date : str
198+ def get_snapshot_name (
199+ prefix : str , survey_year : str , spp_date : str = ""
201200) -> tuple [str , int , str ]:
202- """Return the name of an spp snapshot delivered on the given date.
201+ """Return the name, version and created date of an spp snapshot.
202+
203+ Optionally check first against a target date. The latest version is returned
204+ if multiple files match the criteria.
203205
204206 Args:
205207 survey_year (str): The survey year the snapshot belongs to.
@@ -215,8 +217,10 @@ def get_snapshot_name_from_date(
215217 filtered_mani_file_list = filter_manifest_files (files_dict , spp_date , wanted_str )
216218 # check filtered files for metadata matching the target date and metadata validity
217219 candidate_file_list = check_files (filtered_mani_file_list , spp_date )
218- # if no candidate files are found checking the last modified date, search all files
219- if not candidate_file_list :
220+
221+ # if we checked against a target date but no candidate files are found,
222+ # try filtering only by wanted string
223+ if not (spp_date == "" ) and not candidate_file_list :
220224 files_dict = filter_manifest_files (files_dict , wanted_str = wanted_str )
221225 candidate_file_list = check_files (filtered_mani_file_list , spp_date )
222226 if not candidate_file_list :
@@ -230,28 +234,3 @@ def get_snapshot_name_from_date(
230234 ]
231235 snapshot_name , version , created_date = get_lastest_version_file (duplicate_files )
232236 return snapshot_name , version , created_date
233-
234-
235- def get_latest_snapshot_name (prefix : str , survey_year : str ) -> tuple [str , int , str ]:
236- """Return the name of the latest spp snapshot for a given survey year.
237-
238- Args:
239- survey_year (str): The survey year the snapshot belongs to.
240-
241- Returns:
242- tuple: The name of the latest spp snapshot, its version, and created date.
243- """
244- # get a dictionary of all manifest files with the given prefix
245- files_dict = rd_list_manifest_files (prefix )
246- # filter the manifest files for the wanted string
247- wanted_str = f"snapshot-{ survey_year } 12-002-"
248- filtered_mani_file_list = filter_manifest_files (files_dict , wanted_str = wanted_str )
249- candidate_file_list = check_files (filtered_mani_file_list , target_date = "" )
250- # check filtered files for metadata validity
251- snapshot_name = get_most_recent_file (candidate_file_list )
252- # check whether snapshot filename is unique, if not get the highest version
253- duplicate_files = [
254- file for file in candidate_file_list if file .spp_filename == snapshot_name
255- ]
256- snapshot_name , version , created_date = get_lastest_version_file (duplicate_files )
257- return snapshot_name , version , created_date
0 commit comments