
Commit bfa27e3

Make the synthesizer names consistent throughout SDGym (#433)
1 parent d9ba6e7 commit bfa27e3

11 files changed (+89 / −36 lines)

.github/workflows/integration.yml

Lines changed: 0 additions & 5 deletions
@@ -12,16 +12,11 @@ jobs:
       matrix:
         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
         os: [ubuntu-latest, windows-latest]
-        exclude:
-          - os: windows-latest
-            python-version: '3.13'
         include:
           - os: macos-latest
             python-version: '3.8'
           - os: macos-latest
             python-version: '3.13'
-          - os: windows-latest
-            python-version: '3.13.6'
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

.github/workflows/minimum.yml

Lines changed: 0 additions & 5 deletions
@@ -17,16 +17,11 @@ jobs:
       matrix:
         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
         os: [ubuntu-latest, windows-latest]
-        exclude:
-          - os: windows-latest
-            python-version: '3.13'
         include:
           - os: macos-latest
             python-version: '3.8'
           - os: macos-latest
             python-version: '3.13'
-          - os: windows-latest
-            python-version: '3.13.6'
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

sdgym/benchmark.py

Lines changed: 3 additions & 3 deletions
@@ -163,7 +163,7 @@ def _setup_output_destination_aws(output_destination, synthesizers, datasets, s3
         synth_folder = f'{dataset_folder}/{synth_name}'
         s3_client.put_object(Bucket=bucket_name, Key=synth_folder + '/')
         paths[dataset][synth_name] = {
-            'synthesizer': f's3://{bucket_name}/{synth_folder}/{synth_name}_synthesizer.pkl',
+            'synthesizer': f's3://{bucket_name}/{synth_folder}/{synth_name}.pkl',
             'synthetic_data': f's3://{bucket_name}/{synth_folder}/{synth_name}_synthetic_data.csv',
             'benchmark_result': f's3://{bucket_name}/{synth_folder}/{synth_name}_benchmark_result.csv',
             'results': f's3://{bucket_name}/{top_folder}/results_{today}_{increment}.csv',
@@ -212,7 +212,7 @@ def _setup_output_destination(output_destination, synthesizers, datasets, s3_cli
             synth_folder.mkdir(parents=True, exist_ok=True)

             paths[dataset][synth_name] = {
-                'synthesizer': str(synth_folder / f'{synth_name}_synthesizer.pkl'),
+                'synthesizer': str(synth_folder / f'{synth_name}.pkl'),
                 'synthetic_data': str(synth_folder / f'{synth_name}_synthetic_data.csv'),
                 'benchmark_result': str(synth_folder / f'{synth_name}_benchmark_result.csv'),
                 'run_id': str(top_folder / f'run_{today}_{increment}.yaml'),
@@ -1307,7 +1307,7 @@ def _get_user_data_script(access_key, secret_key, region_name, script_content):

     echo "======== Install Dependencies in venv ============"
     pip install --upgrade pip
-    pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym#egg=sdgym"
+    pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@main"
     pip install s3fs

     echo "======== Write Script ==========="

sdgym/result_explorer/result_explorer.py

Lines changed: 3 additions & 3 deletions
@@ -35,11 +35,11 @@ def list(self):

     def _get_file_path(self, results_folder_name, dataset_name, synthesizer_name, type):
         """Validate access to the synthesizer or synthetic data file."""
-        end_filename = f'{synthesizer_name}_'
+        end_filename = f'{synthesizer_name}'
         if type == 'synthetic_data':
-            end_filename += 'synthetic_data.csv'
+            end_filename += '_synthetic_data.csv'
         elif type == 'synthesizer':
-            end_filename += 'synthesizer.pkl'
+            end_filename += '.pkl'

         date = '_'.join(results_folder_name.split('_')[-3:])
         path_parts = [results_folder_name, f'{dataset_name}_{date}', synthesizer_name]
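
Read together with the benchmark change above, `_get_file_path` now reconstructs exactly the filenames the benchmark writes. A small standalone sketch of the renamed-suffix logic follows; the helper name and synthesizer name are placeholders, not part of the commit.

def build_end_filename(synthesizer_name, type):
    """Sketch of the suffix logic from _get_file_path after this commit."""
    end_filename = f'{synthesizer_name}'
    if type == 'synthetic_data':
        end_filename += '_synthetic_data.csv'
    elif type == 'synthesizer':
        end_filename += '.pkl'
    return end_filename


# Placeholder synthesizer name, for illustration only.
assert build_end_filename('GaussianCopulaSynthesizer', 'synthesizer') == 'GaussianCopulaSynthesizer.pkl'
assert build_end_filename('GaussianCopulaSynthesizer', 'synthetic_data') == (
    'GaussianCopulaSynthesizer_synthetic_data.csv'
)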

sdgym/result_explorer/result_handler.py

Lines changed: 62 additions & 4 deletions
@@ -15,6 +15,7 @@
 RESULTS_FOLDER_PREFIX = 'SDGym_results_'
 RUN_ID_PREFIX = 'run_'
 RESULTS_FILE_PREFIX = 'results_'
+NUM_DIGITS_DATE = 10


 class ResultsHandler(ABC):
@@ -103,7 +104,7 @@ def _get_column_name_infos(self, folder_to_results):
                 results['Synthesizer'] == SYNTHESIZER_BASELINE, 'Dataset'
             ].nunique()
             folder_to_info[folder] = {
-                'date': run_id_info.get('starting_date')[:10],  # Extract only the YYYY-MM-DD
+                'date': run_id_info.get('starting_date')[:NUM_DIGITS_DATE],
                 'sdgym_version': run_id_info.get('sdgym_version'),
                 '# datasets': num_datasets,
             }
@@ -133,7 +134,7 @@ def summarize(self, folder_name):
         if folder_name not in all_folders:
             raise ValueError(f'Folder "{folder_name}" does not exist in the results directory.')

-        date = pd.to_datetime(folder_name[-10:], format='%m_%d_%Y')
+        date = pd.to_datetime(folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y')
         folder_to_results = {}
         for folder in all_folders:
             folder_date = pd.to_datetime(folder[len(RESULTS_FOLDER_PREFIX) :], format='%m_%d_%Y')
@@ -241,11 +242,68 @@ def list(self):

     def get_file_path(self, path_parts, end_filename):
         """Validate access to a specific file in S3."""
+        idx_to_structure = {0: 'Folder', 1: 'Dataset', 2: 'Synthesizer'}
         file_path = '/'.join(path_parts + [end_filename])
+        previous_s3_key = self.prefix
+        for idx in range(len(path_parts)):
+            level_name = idx_to_structure[idx]
+            current_path = '/'.join(path_parts[: idx + 1]) + '/'
+            s3_key = f'{self.prefix}{current_path}'
+            response = self.s3_client.list_objects_v2(
+                Bucket=self.bucket_name, Prefix=s3_key, MaxKeys=1
+            )
+
+            if 'Contents' not in response:
+                # If missing, fetch available items under previous level
+                parent_response = self.s3_client.list_objects_v2(
+                    Bucket=self.bucket_name, Prefix=previous_s3_key
+                )
+                available_items = set()
+                if 'Contents' in parent_response:
+                    for obj in parent_response['Contents']:
+                        rel_path = obj['Key'][len(previous_s3_key) :]
+                        if '/' in rel_path:
+                            folder = rel_path.split('/')[0]
+                            if folder:
+                                folder = folder[: -NUM_DIGITS_DATE - 1] if idx == 1 else folder
+                                available_items.add(folder)
+
+                folder_name = path_parts[idx]
+                available_list = ',\n'.join(sorted(available_items)) or 'None'
+                if level_name == 'Dataset':
+                    folder_name = folder_name[: -NUM_DIGITS_DATE - 1]
+
+                if level_name == 'Folder':
+                    raise ValueError(
+                        f"The specified run '{folder_name}' does not exist in 'Benchmarks'. "
+                        f'The available runs are:\n{available_list}'
+                    )
+                elif level_name == 'Dataset':
+                    run_name = path_parts[0]
+                    raise ValueError(
+                        f"Dataset '{folder_name}' was not part of the run '{run_name}'. "
+                        f'The available datasets for this run are:\n{available_list}'
+                    )
+                else:
+                    run_name = path_parts[0]
+                    dataset_name = path_parts[1][: -NUM_DIGITS_DATE - 1]
+                    raise ValueError(
+                        f"Synthesizer '{folder_name}' was not part of the run '{run_name}' "
+                        f"for the dataset '{dataset_name}'. "
+                        'The available synthesizers for this run and dataset are'
+                        f':\n{available_list}'
+                    )
+
+            previous_s3_key = s3_key
+
+        key = f'{self.prefix}{file_path}'
         try:
-            self.s3_client.head_object(Bucket=self.bucket_name, Key=f'{self.prefix}{file_path}')
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=key)
         except ClientError as e:
-            raise ValueError(f'S3 object does not exist: {file_path}') from e
+            raise ValueError(
+                f'File "{end_filename}" does not exist in S3 path: {self.prefix}{file_path}'
+            ) from e
+
         return file_path

     def load_synthesizer(self, file_path):
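
The new loop in `get_file_path` validates each level of the S3 path (run folder, then dataset, then synthesizer) with `list_objects_v2` before the final `head_object` check, so a wrong name now raises a targeted error listing what is actually available at that level. Below is a minimal sketch of that behaviour using a mocked client in the style of the unit tests; the bucket, prefix, and run names are placeholders, and the import path is an assumption based on the module shown in this diff.

from unittest.mock import Mock

# Import path assumed from sdgym/result_explorer/result_handler.py.
from sdgym.result_explorer.result_handler import S3ResultsHandler

mock_s3_client = Mock()
# An empty response has no 'Contents' key, so the requested run folder is
# treated as missing at the first validation level.
mock_s3_client.list_objects_v2.return_value = {}

handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
try:
    handler.get_file_path(['SDGym_results_01_01_2025', 'my_dataset_01_01_2025'], 'synthesizer.pkl')
except ValueError as error:
    # Expected message: "The specified run 'SDGym_results_01_01_2025' does not
    # exist in 'Benchmarks'. The available runs are:\nNone"
    print(error)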

sdgym/run_benchmark/utils.py

Lines changed: 2 additions & 2 deletions
@@ -92,7 +92,7 @@ def post_slack_message(channel, text):

 def post_benchmark_launch_message(date_str):
     """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
-    channel = DEBUG_SLACK_CHANNEL
+    channel = SLACK_CHANNEL
     folder_name = get_result_folder_name(date_str)
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/')
@@ -103,7 +103,7 @@ def post_benchmark_uploaded_message(folder_name, commit_url=None):

 def post_benchmark_uploaded_message(folder_name, commit_url=None):
     """Post benchmark uploaded message to sdv-alerts slack channel."""
-    channel = DEBUG_SLACK_CHANNEL
+    channel = SLACK_CHANNEL
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}SDGym Monthly Run.xlsx')
     body = (

tests/integration/test_benchmark.py

Lines changed: 2 additions & 2 deletions
@@ -630,7 +630,7 @@ def test_benchmark_single_table_with_output_destination(tmp_path):
            )
        )
        assert set(synthesizer_files) == {
-            f'{synthesizer}_synthesizer.pkl',
+            f'{synthesizer}.pkl',
            f'{synthesizer}_synthetic_data.csv',
            f'{synthesizer}_benchmark_result.csv',
        }
@@ -706,7 +706,7 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path):
            )
        )
        assert set(synthesizer_files) == {
-            f'{synthesizer}_synthesizer.pkl',
+            f'{synthesizer}.pkl',
            f'{synthesizer}_synthetic_data.csv',
            f'{synthesizer}_benchmark_result.csv',
        }

tests/unit/result_explorer/test_result_explorer.py

Lines changed: 2 additions & 2 deletions
@@ -113,7 +113,7 @@ def test__get_file_path(self):
        type = 'synthesizer'
        expected_filepath = (
            f'{results_folder_name}/{dataset_name}_07_07_2025/{synthesizer_name}/'
-            f'{synthesizer_name}_synthesizer.pkl'
+            f'{synthesizer_name}.pkl'
        )
        explorer._handler.get_file_path.return_value = expected_filepath

@@ -125,7 +125,7 @@ def test__get_file_path(self):
        # Assert
        explorer._handler.get_file_path.assert_called_once_with(
            [results_folder_name, f'{dataset_name}_07_07_2025', synthesizer_name],
-            f'{synthesizer_name}_synthesizer.pkl',
+            f'{synthesizer_name}.pkl',
        )
        assert file_path == expected_filepath

tests/unit/result_explorer/test_result_handler.py

Lines changed: 9 additions & 2 deletions
@@ -244,7 +244,7 @@ def test_load_synthesizer(self, tmp_path):
            / folder_name
            / f'{dataset_name}_07_07_2025'
            / synthesizer_name
-            / f'{synthesizer_name}_synthesizer.pkl'
+            / f'{synthesizer_name}.pkl'
        )
        synthesizer.save(synthesizer_path)
        result_handler = LocalResultsHandler(str(tmp_path))
@@ -397,19 +397,26 @@ def test_load_synthesizer(self):
        )

    def test_get_file_path_s3(self):
-        """Test `get_file_path` for S3 path when files exist."""
+        """Test `get_file_path` for S3 path when folders and file exist."""
        # Setup
        mock_s3_client = Mock()
        handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
        path_parts = ['results_folder_07_07_2025', 'my_dataset']
        end_filename = 'synthesizer.pkl'
        file_path = 'results_folder_07_07_2025/my_dataset/synthesizer.pkl'
+        mock_s3_client.list_objects_v2.return_value = {'Contents': [{}]}

        # Run
        result = handler.get_file_path(path_parts, end_filename)

        # Assert
        assert result == file_path
+        mock_s3_client.list_objects_v2.assert_any_call(
+            Bucket='my-bucket', Prefix='prefix/results_folder_07_07_2025/', MaxKeys=1
+        )
+        mock_s3_client.list_objects_v2.assert_any_call(
+            Bucket='my-bucket', Prefix='prefix/results_folder_07_07_2025/my_dataset/', MaxKeys=1
+        )
        mock_s3_client.head_object.assert_called_once_with(
            Bucket='my-bucket', Key='prefix/results_folder_07_07_2025/my_dataset/synthesizer.pkl'
        )

tests/unit/run_benchmark/test_utils.py

Lines changed: 4 additions & 4 deletions
@@ -4,9 +4,9 @@
 import pytest

 from sdgym.run_benchmark.utils import (
-    DEBUG_SLACK_CHANNEL,
     GDRIVE_LINK,
     OUTPUT_DESTINATION_AWS,
+    SLACK_CHANNEL,
     _get_slack_client,
     get_df_to_plot,
     get_result_folder_name,
@@ -105,7 +105,7 @@ def test_post_benchmark_launch_message(
    mock_get_result_folder_name.assert_called_once_with(date_str)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with('my-bucket', f'my-prefix/{folder_name}/')
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)


 @patch('sdgym.run_benchmark.utils.post_slack_message')
@@ -133,7 +133,7 @@ def test_post_benchmark_uploaded_message(
    post_benchmark_uploaded_message(folder_name)

    # Assert
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with(
        'my-bucket', 'my-prefix/SDGym Monthly Run.xlsx'
@@ -167,7 +167,7 @@ def test_post_benchmark_uploaded_message_with_commit(
    post_benchmark_uploaded_message(folder_name, commit_url)

    # Assert
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with(
        'my-bucket', 'my-prefix/SDGym Monthly Run.xlsx'
