
Commit bfa27e3

Make the synthesizer names consistent throughout SDGym (#433)
1 parent d9ba6e7 commit bfa27e3

11 files changed (+89 / −36 lines)

.github/workflows/integration.yml

Lines changed: 0 additions & 5 deletions
@@ -12,16 +12,11 @@ jobs:
       matrix:
         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
         os: [ubuntu-latest, windows-latest]
-        exclude:
-          - os: windows-latest
-            python-version: '3.13'
         include:
           - os: macos-latest
             python-version: '3.8'
           - os: macos-latest
             python-version: '3.13'
-          - os: windows-latest
-            python-version: '3.13.6'
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

.github/workflows/minimum.yml

Lines changed: 0 additions & 5 deletions
@@ -17,16 +17,11 @@ jobs:
       matrix:
         python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
         os: [ubuntu-latest, windows-latest]
-        exclude:
-          - os: windows-latest
-            python-version: '3.13'
         include:
           - os: macos-latest
             python-version: '3.8'
           - os: macos-latest
             python-version: '3.13'
-          - os: windows-latest
-            python-version: '3.13.6'
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}

sdgym/benchmark.py

Lines changed: 3 additions & 3 deletions
@@ -163,7 +163,7 @@ def _setup_output_destination_aws(output_destination, synthesizers, datasets, s3
         synth_folder = f'{dataset_folder}/{synth_name}'
         s3_client.put_object(Bucket=bucket_name, Key=synth_folder + '/')
         paths[dataset][synth_name] = {
-            'synthesizer': f's3://{bucket_name}/{synth_folder}/{synth_name}_synthesizer.pkl',
+            'synthesizer': f's3://{bucket_name}/{synth_folder}/{synth_name}.pkl',
             'synthetic_data': f's3://{bucket_name}/{synth_folder}/{synth_name}_synthetic_data.csv',
             'benchmark_result': f's3://{bucket_name}/{synth_folder}/{synth_name}_benchmark_result.csv',
             'results': f's3://{bucket_name}/{top_folder}/results_{today}_{increment}.csv',
@@ -212,7 +212,7 @@ def _setup_output_destination(output_destination, synthesizers, datasets, s3_cli
             synth_folder.mkdir(parents=True, exist_ok=True)

             paths[dataset][synth_name] = {
-                'synthesizer': str(synth_folder / f'{synth_name}_synthesizer.pkl'),
+                'synthesizer': str(synth_folder / f'{synth_name}.pkl'),
                 'synthetic_data': str(synth_folder / f'{synth_name}_synthetic_data.csv'),
                 'benchmark_result': str(synth_folder / f'{synth_name}_benchmark_result.csv'),
                 'run_id': str(top_folder / f'run_{today}_{increment}.yaml'),
@@ -1307,7 +1307,7 @@ def _get_user_data_script(access_key, secret_key, region_name, script_content):

     echo "======== Install Dependencies in venv ============"
     pip install --upgrade pip
-    pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@issue-425-workflow-sdgym#egg=sdgym"
+    pip install "sdgym[all] @ git+https://github.com/sdv-dev/SDGym.git@main"
     pip install s3fs

     echo "======== Write Script ==========="

sdgym/result_explorer/result_explorer.py

Lines changed: 3 additions & 3 deletions
@@ -35,11 +35,11 @@ def list(self):

     def _get_file_path(self, results_folder_name, dataset_name, synthesizer_name, type):
         """Validate access to the synthesizer or synthetic data file."""
-        end_filename = f'{synthesizer_name}_'
+        end_filename = f'{synthesizer_name}'
         if type == 'synthetic_data':
-            end_filename += 'synthetic_data.csv'
+            end_filename += '_synthetic_data.csv'
         elif type == 'synthesizer':
-            end_filename += 'synthesizer.pkl'
+            end_filename += '.pkl'

         date = '_'.join(results_folder_name.split('_')[-3:])
         path_parts = [results_folder_name, f'{dataset_name}_{date}', synthesizer_name]
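
Read together with the benchmark change above, `_get_file_path` now reconstructs exactly the filenames the benchmark writes. A small standalone sketch of the renamed-suffix logic follows; the helper name and synthesizer name are placeholders, not part of the commit.

def build_end_filename(synthesizer_name, type):
    """Sketch of the suffix logic from _get_file_path after this commit."""
    end_filename = f'{synthesizer_name}'
    if type == 'synthetic_data':
        end_filename += '_synthetic_data.csv'
    elif type == 'synthesizer':
        end_filename += '.pkl'
    return end_filename


# Placeholder synthesizer name, for illustration only.
assert build_end_filename('GaussianCopulaSynthesizer', 'synthesizer') == 'GaussianCopulaSynthesizer.pkl'
assert build_end_filename('GaussianCopulaSynthesizer', 'synthetic_data') == (
    'GaussianCopulaSynthesizer_synthetic_data.csv'
)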

sdgym/result_explorer/result_handler.py

Lines changed: 62 additions & 4 deletions
@@ -15,6 +15,7 @@
 RESULTS_FOLDER_PREFIX = 'SDGym_results_'
 RUN_ID_PREFIX = 'run_'
 RESULTS_FILE_PREFIX = 'results_'
+NUM_DIGITS_DATE = 10


 class ResultsHandler(ABC):
@@ -103,7 +104,7 @@ def _get_column_name_infos(self, folder_to_results):
                 results['Synthesizer'] == SYNTHESIZER_BASELINE, 'Dataset'
             ].nunique()
             folder_to_info[folder] = {
-                'date': run_id_info.get('starting_date')[:10],  # Extract only the YYYY-MM-DD
+                'date': run_id_info.get('starting_date')[:NUM_DIGITS_DATE],
                 'sdgym_version': run_id_info.get('sdgym_version'),
                 '# datasets': num_datasets,
             }
@@ -133,7 +134,7 @@ def summarize(self, folder_name):
         if folder_name not in all_folders:
             raise ValueError(f'Folder "{folder_name}" does not exist in the results directory.')

-        date = pd.to_datetime(folder_name[-10:], format='%m_%d_%Y')
+        date = pd.to_datetime(folder_name[-NUM_DIGITS_DATE:], format='%m_%d_%Y')
         folder_to_results = {}
         for folder in all_folders:
             folder_date = pd.to_datetime(folder[len(RESULTS_FOLDER_PREFIX) :], format='%m_%d_%Y')
@@ -241,11 +242,68 @@ def list(self):

     def get_file_path(self, path_parts, end_filename):
         """Validate access to a specific file in S3."""
+        idx_to_structure = {0: 'Folder', 1: 'Dataset', 2: 'Synthesizer'}
         file_path = '/'.join(path_parts + [end_filename])
+        previous_s3_key = self.prefix
+        for idx in range(len(path_parts)):
+            level_name = idx_to_structure[idx]
+            current_path = '/'.join(path_parts[: idx + 1]) + '/'
+            s3_key = f'{self.prefix}{current_path}'
+            response = self.s3_client.list_objects_v2(
+                Bucket=self.bucket_name, Prefix=s3_key, MaxKeys=1
+            )
+
+            if 'Contents' not in response:
+                # If missing, fetch available items under previous level
+                parent_response = self.s3_client.list_objects_v2(
+                    Bucket=self.bucket_name, Prefix=previous_s3_key
+                )
+                available_items = set()
+                if 'Contents' in parent_response:
+                    for obj in parent_response['Contents']:
+                        rel_path = obj['Key'][len(previous_s3_key) :]
+                        if '/' in rel_path:
+                            folder = rel_path.split('/')[0]
+                            if folder:
+                                folder = folder[: -NUM_DIGITS_DATE - 1] if idx == 1 else folder
+                                available_items.add(folder)
+
+                folder_name = path_parts[idx]
+                available_list = ',\n'.join(sorted(available_items)) or 'None'
+                if level_name == 'Dataset':
+                    folder_name = folder_name[: -NUM_DIGITS_DATE - 1]
+
+                if level_name == 'Folder':
+                    raise ValueError(
+                        f"The specified run '{folder_name}' does not exist in 'Benchmarks'. "
+                        f'The available runs are:\n{available_list}'
+                    )
+                elif level_name == 'Dataset':
+                    run_name = path_parts[0]
+                    raise ValueError(
+                        f"Dataset '{folder_name}' was not part of the run '{run_name}'. "
+                        f'The available datasets for this run are:\n{available_list}'
+                    )
+                else:
+                    run_name = path_parts[0]
+                    dataset_name = path_parts[1][: -NUM_DIGITS_DATE - 1]
+                    raise ValueError(
+                        f"Synthesizer '{folder_name}' was not part of the run '{run_name}' "
+                        f"for the dataset '{dataset_name}'. "
+                        'The available synthesizers for this run and dataset are'
+                        f':\n{available_list}'
+                    )
+
+            previous_s3_key = s3_key
+
+        key = f'{self.prefix}{file_path}'
         try:
-            self.s3_client.head_object(Bucket=self.bucket_name, Key=f'{self.prefix}{file_path}')
+            self.s3_client.head_object(Bucket=self.bucket_name, Key=key)
         except ClientError as e:
-            raise ValueError(f'S3 object does not exist: {file_path}') from e
+            raise ValueError(
+                f'File "{end_filename}" does not exist in S3 path: {self.prefix}{file_path}'
+            ) from e
+
         return file_path

     def load_synthesizer(self, file_path):
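
The new loop in `get_file_path` validates each level of the S3 path (run folder, then dataset, then synthesizer) with `list_objects_v2` before the final `head_object` check, so a wrong name now raises a targeted error listing what is actually available at that level. Below is a minimal sketch of that behaviour using a mocked client in the style of the unit tests; the bucket, prefix, and run names are placeholders, and the import path is an assumption based on the module shown in this diff.

from unittest.mock import Mock

# Import path assumed from sdgym/result_explorer/result_handler.py.
from sdgym.result_explorer.result_handler import S3ResultsHandler

mock_s3_client = Mock()
# An empty response has no 'Contents' key, so the requested run folder is
# treated as missing at the first validation level.
mock_s3_client.list_objects_v2.return_value = {}

handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
try:
    handler.get_file_path(['SDGym_results_01_01_2025', 'my_dataset_01_01_2025'], 'synthesizer.pkl')
except ValueError as error:
    # Expected message: "The specified run 'SDGym_results_01_01_2025' does not
    # exist in 'Benchmarks'. The available runs are:\nNone"
    print(error)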

sdgym/run_benchmark/utils.py

Lines changed: 2 additions & 2 deletions
@@ -92,7 +92,7 @@ def post_slack_message(channel, text):

 def post_benchmark_launch_message(date_str):
     """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
-    channel = DEBUG_SLACK_CHANNEL
+    channel = SLACK_CHANNEL
     folder_name = get_result_folder_name(date_str)
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/')
@@ -103,7 +103,7 @@ def post_benchmark_uploaded_message(folder_name, commit_url=None):

 def post_benchmark_uploaded_message(folder_name, commit_url=None):
     """Post benchmark uploaded message to sdv-alerts slack channel."""
-    channel = DEBUG_SLACK_CHANNEL
+    channel = SLACK_CHANNEL
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}SDGym Monthly Run.xlsx')
     body = (

tests/integration/test_benchmark.py

Lines changed: 2 additions & 2 deletions
@@ -630,7 +630,7 @@ def test_benchmark_single_table_with_output_destination(tmp_path):
            )
        )
        assert set(synthesizer_files) == {
-            f'{synthesizer}_synthesizer.pkl',
+            f'{synthesizer}.pkl',
            f'{synthesizer}_synthetic_data.csv',
            f'{synthesizer}_benchmark_result.csv',
        }
@@ -706,7 +706,7 @@ def test_benchmark_single_table_with_output_destination_multiple_runs(tmp_path):
            )
        )
        assert set(synthesizer_files) == {
-            f'{synthesizer}_synthesizer.pkl',
+            f'{synthesizer}.pkl',
            f'{synthesizer}_synthetic_data.csv',
            f'{synthesizer}_benchmark_result.csv',
        }

tests/unit/result_explorer/test_result_explorer.py

Lines changed: 2 additions & 2 deletions
@@ -113,7 +113,7 @@ def test__get_file_path(self):
        type = 'synthesizer'
        expected_filepath = (
            f'{results_folder_name}/{dataset_name}_07_07_2025/{synthesizer_name}/'
-            f'{synthesizer_name}_synthesizer.pkl'
+            f'{synthesizer_name}.pkl'
        )
        explorer._handler.get_file_path.return_value = expected_filepath

@@ -125,7 +125,7 @@ def test__get_file_path(self):
        # Assert
        explorer._handler.get_file_path.assert_called_once_with(
            [results_folder_name, f'{dataset_name}_07_07_2025', synthesizer_name],
-            f'{synthesizer_name}_synthesizer.pkl',
+            f'{synthesizer_name}.pkl',
        )
        assert file_path == expected_filepath

tests/unit/result_explorer/test_result_handler.py

Lines changed: 9 additions & 2 deletions
@@ -244,7 +244,7 @@ def test_load_synthesizer(self, tmp_path):
            / folder_name
            / f'{dataset_name}_07_07_2025'
            / synthesizer_name
-            / f'{synthesizer_name}_synthesizer.pkl'
+            / f'{synthesizer_name}.pkl'
        )
        synthesizer.save(synthesizer_path)
        result_handler = LocalResultsHandler(str(tmp_path))
@@ -397,19 +397,26 @@ def test_load_synthesizer(self):
        )

    def test_get_file_path_s3(self):
-        """Test `get_file_path` for S3 path when files exist."""
+        """Test `get_file_path` for S3 path when folders and file exist."""
        # Setup
        mock_s3_client = Mock()
        handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
        path_parts = ['results_folder_07_07_2025', 'my_dataset']
        end_filename = 'synthesizer.pkl'
        file_path = 'results_folder_07_07_2025/my_dataset/synthesizer.pkl'
+        mock_s3_client.list_objects_v2.return_value = {'Contents': [{}]}

        # Run
        result = handler.get_file_path(path_parts, end_filename)

        # Assert
        assert result == file_path
+        mock_s3_client.list_objects_v2.assert_any_call(
+            Bucket='my-bucket', Prefix='prefix/results_folder_07_07_2025/', MaxKeys=1
+        )
+        mock_s3_client.list_objects_v2.assert_any_call(
+            Bucket='my-bucket', Prefix='prefix/results_folder_07_07_2025/my_dataset/', MaxKeys=1
+        )
        mock_s3_client.head_object.assert_called_once_with(
            Bucket='my-bucket', Key='prefix/results_folder_07_07_2025/my_dataset/synthesizer.pkl'
        )

tests/unit/run_benchmark/test_utils.py

Lines changed: 4 additions & 4 deletions
@@ -4,9 +4,9 @@
 import pytest

 from sdgym.run_benchmark.utils import (
-    DEBUG_SLACK_CHANNEL,
     GDRIVE_LINK,
     OUTPUT_DESTINATION_AWS,
+    SLACK_CHANNEL,
     _get_slack_client,
     get_df_to_plot,
     get_result_folder_name,
@@ -105,7 +105,7 @@ def test_post_benchmark_launch_message(
    mock_get_result_folder_name.assert_called_once_with(date_str)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with('my-bucket', f'my-prefix/{folder_name}/')
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)


 @patch('sdgym.run_benchmark.utils.post_slack_message')
@@ -133,7 +133,7 @@ def test_post_benchmark_uploaded_message(
    post_benchmark_uploaded_message(folder_name)

    # Assert
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with(
        'my-bucket', 'my-prefix/SDGym Monthly Run.xlsx'
@@ -167,7 +167,7 @@ def test_post_benchmark_uploaded_message_with_commit(
    post_benchmark_uploaded_message(folder_name, commit_url)

    # Assert
-    mock_post_slack_message.assert_called_once_with(DEBUG_SLACK_CHANNEL, expected_body)
+    mock_post_slack_message.assert_called_once_with(SLACK_CHANNEL, expected_body)
    mock_parse_s3_path.assert_called_once_with(OUTPUT_DESTINATION_AWS)
    mock_get_s3_console_link.assert_called_once_with(
        'my-bucket', 'my-prefix/SDGym Monthly Run.xlsx'
