|
# Naming conventions for benchmark artifacts in the results directory.
RESULTS_FOLDER_PREFIX = 'SDGym_results_'
RUN_ID_PREFIX = 'run_'
RESULTS_FILE_PREFIX = 'results_'
# Number of characters in a date string, e.g. 'YYYY-MM-DD' or 'MM_DD_YYYY'.
# Used both to take the date prefix of an ISO timestamp and to strip the
# '_MM_DD_YYYY' suffix from folder names.
NUM_DIGITS_DATE = 10
18 | 19 |
|
19 | 20 |
|
20 | 21 | class ResultsHandler(ABC): |
@@ -103,7 +104,7 @@ def _get_column_name_infos(self, folder_to_results): |
103 | 104 | results['Synthesizer'] == SYNTHESIZER_BASELINE, 'Dataset' |
104 | 105 | ].nunique() |
105 | 106 | folder_to_info[folder] = { |
106 | | - 'date': run_id_info.get('starting_date')[:10], # Extract only the YYYY-MM-DD |
| 107 | + 'date': run_id_info.get('starting_date')[:NUM_DIGITS_DATE], |
107 | 108 | 'sdgym_version': run_id_info.get('sdgym_version'), |
108 | 109 | '# datasets': num_datasets, |
109 | 110 | } |
@@ -133,7 +134,7 @@ def summarize(self, folder_name): |
133 | 134 | if folder_name not in all_folders: |
134 | 135 | raise ValueError(f'Folder "{folder_name}" does not exist in the results directory.') |
135 | 136 |
|
136 | | - date = pd.to_datetime(folder_name[-10:], format='%m_%d_%Y') |
| 137 | + date = pd.to_datetime(folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y') |
137 | 138 | folder_to_results = {} |
138 | 139 | for folder in all_folders: |
139 | 140 | folder_date = pd.to_datetime(folder[len(RESULTS_FOLDER_PREFIX) :], format='%m_%d_%Y') |
@@ -241,11 +242,68 @@ def list(self): |
241 | 242 |
|
def get_file_path(self, path_parts, end_filename):
    """Validate access to a specific file in S3 and return its relative path.

    Walks ``path_parts`` one level at a time (run folder, dataset folder,
    synthesizer folder) and checks that each prefix exists in the bucket.
    When a level is missing, the parent prefix is listed so the error can
    name the available alternatives. Finally the file itself is checked
    with ``head_object``.

    Args:
        path_parts (list[str]):
            Path components, typically ``[run_folder, dataset, synthesizer]``.
        end_filename (str):
            Name of the file expected at the end of the path.

    Returns:
        str:
            The validated relative path ``'/'.join(path_parts + [end_filename])``
            (without ``self.prefix``).

    Raises:
        ValueError:
            If any intermediate level or the final file does not exist.
    """
    level_names = {0: 'Folder', 1: 'Dataset', 2: 'Synthesizer'}

    def _strip_date_suffix(name):
        # Dataset folders end with '_MM_DD_YYYY'; drop the date and its '_'.
        return name[: -NUM_DIGITS_DATE - 1]

    file_path = '/'.join(path_parts + [end_filename])
    previous_s3_key = self.prefix
    for idx, part in enumerate(path_parts):
        # Fall back to 'Synthesizer' semantics for unexpectedly deep paths
        # instead of raising a KeyError on the lookup table.
        level_name = level_names.get(idx, 'Synthesizer')
        current_path = '/'.join(path_parts[: idx + 1]) + '/'
        s3_key = f'{self.prefix}{current_path}'
        response = self.s3_client.list_objects_v2(
            Bucket=self.bucket_name, Prefix=s3_key, MaxKeys=1
        )

        if 'Contents' not in response:
            # Level is missing: list the parent level to report alternatives.
            # NOTE(review): list_objects_v2 returns at most 1000 keys per call;
            # a truncated listing would under-report the available items.
            parent_response = self.s3_client.list_objects_v2(
                Bucket=self.bucket_name, Prefix=previous_s3_key
            )
            available_items = set()
            for obj in parent_response.get('Contents', []):
                rel_path = obj['Key'][len(previous_s3_key):]
                if '/' in rel_path:
                    folder = rel_path.split('/')[0]
                    if folder:
                        # At the dataset level, hide the date suffix so the
                        # listed names match what the caller passes in.
                        available_items.add(
                            _strip_date_suffix(folder) if idx == 1 else folder
                        )

            available_list = ',\n'.join(sorted(available_items)) or 'None'
            folder_name = _strip_date_suffix(part) if level_name == 'Dataset' else part

            if level_name == 'Folder':
                raise ValueError(
                    f"The specified run '{folder_name}' does not exist in 'Benchmarks'. "
                    f'The available runs are:\n{available_list}'
                )
            elif level_name == 'Dataset':
                run_name = path_parts[0]
                raise ValueError(
                    f"Dataset '{folder_name}' was not part of the run '{run_name}'. "
                    f'The available datasets for this run are:\n{available_list}'
                )
            else:
                run_name = path_parts[0]
                dataset_name = _strip_date_suffix(path_parts[1])
                raise ValueError(
                    f"Synthesizer '{folder_name}' was not part of the run '{run_name}' "
                    f"for the dataset '{dataset_name}'. "
                    'The available synthesizers for this run and dataset are'
                    f':\n{available_list}'
                )

        previous_s3_key = s3_key

    key = f'{self.prefix}{file_path}'
    try:
        self.s3_client.head_object(Bucket=self.bucket_name, Key=key)
    except ClientError as e:
        raise ValueError(
            f'File "{end_filename}" does not exist in S3 path: {self.prefix}{file_path}'
        ) from e

    return file_path
250 | 308 |
|
251 | 309 | def load_synthesizer(self, file_path): |
|
0 commit comments