Skip to content

Commit 07b1988

Browse files
committed
make release-tag: Merge branch 'main' into stable
2 parents 916bc41 + 95743fc commit 07b1988

29 files changed

+1040
-709
lines changed

DATASETS.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,12 @@ Out[6]:
6666
## Getting the list of all the datasets
6767

6868
If you want to obtain the list of all the available datasets you can use the
69-
`sdgym.get_available_datasets` function:
69+
`list_datasets` function:
7070

7171
```python
72-
In [7]: from sdgym import get_available_datasets
72+
In [7]: from sdgym.dataset_explorer import DatasetExplorer
7373

74-
In [8]: get_available_datasets()
74+
In [8]: DatasetExplorer().list_datasets('single_table')
7575
Out[8]:
7676
dataset_name size_MB num_tables
7777
0 KRK_v1 0.072128 1

HISTORY.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
# History
22

3+
## v0.12.0 - 2025-11-20
4+
5+
### New Features
6+
7+
* Rename `create_sdv_synthesizer_variant` to `create_synthesizer_variant` - Issue [#491](https://github.com/sdv-dev/SDGym/issues/491) by @R-Palazzo
8+
* SDGym should be able to automatically discover SDV Enterprise synthesizers - Issue [#481](https://github.com/sdv-dev/SDGym/issues/481) by @R-Palazzo
9+
* Incorporate the `get_available_datasets` functionality into the `DatasetExplorer` - Issue [#473](https://github.com/sdv-dev/SDGym/issues/473) by @fealho
10+
11+
### Bugs Fixed
12+
13+
* Update result aggregation logic in the ResultExplorer to match new naming schema - Issue [#494](https://github.com/sdv-dev/SDGym/issues/494) by @R-Palazzo
14+
* When running a benchmark locally, the `additional_datasets_folder` path should be the root path - Issue [#484](https://github.com/sdv-dev/SDGym/issues/484) by @fealho
15+
316
## v0.11.1 - 2025-11-03
417

518
### Bugs Fixed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,10 +103,10 @@ Learn more in the [Custom Synthesizers Guide](https://docs.sdv.dev/sdgym/customi
103103
## Customizing your datasets
104104

105105
The SDGym library includes many publicly available datasets that you can include right away.
106-
List these using the ``get_available_datasets`` feature.
106+
List these using the ``list_datasets`` feature.
107107

108108
```python
109-
sdgym.get_available_datasets()
109+
sdgym.dataset_explorer.DatasetExplorer().list_datasets('single_table')
110110
```
111111

112112
```

latest_requirements.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
appdirs==1.4.4
22
compress-pickle==2.1.0
33
humanfriendly==10.0
4-
numpy==2.3.4
4+
numpy==2.3.5
5+
openpyxl==3.1.5
56
pandas==2.3.3
67
rdt==1.18.2
78
scikit-learn==1.7.2
89
scipy==1.16.3
9-
sdmetrics==0.23.0
10-
sdv==1.28.0
10+
sdmetrics==0.24.0
11+
sdv==1.29.0
1112
tabulate==0.8.10
12-
torch==2.9.0
13+
torch==2.9.1
1314
tqdm==4.67.1

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ namespaces = false
144144
version = {attr = 'sdgym.__version__'}
145145

146146
[tool.bumpversion]
147-
current_version = "0.11.1"
147+
current_version = "0.12.0.dev0"
148148
parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
149149
serialize = [
150150
'{major}.{minor}.{patch}.{release}{candidate}',

sdgym/__init__.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,20 @@
88
__copyright__ = 'Copyright (c) 2022 DataCebo, Inc.'
99
__email__ = '[email protected]'
1010
__license__ = 'BSL-1.1'
11-
__version__ = '0.11.1'
11+
__version__ = '0.12.0.dev0'
1212

1313
import logging
1414

15-
from sdgym.benchmark import benchmark_single_table
15+
from sdgym.benchmark import benchmark_single_table, benchmark_single_table_aws
1616
from sdgym.cli.collect import collect_results
1717
from sdgym.cli.summary import make_summary_spreadsheet
1818
from sdgym.dataset_explorer import DatasetExplorer
19-
from sdgym.datasets import get_available_datasets, load_dataset
20-
from sdgym.synthesizers import create_sdv_synthesizer_variant, create_single_table_synthesizer
19+
from sdgym.datasets import load_dataset
20+
from sdgym.synthesizers import (
21+
create_synthesizer_variant,
22+
create_single_table_synthesizer,
23+
create_multi_table_synthesizer,
24+
)
2125
from sdgym.result_explorer import ResultsExplorer
2226

2327
# Clear the logging wrongfully configured by tensorflow/absl
@@ -28,10 +32,11 @@
2832
'DatasetExplorer',
2933
'ResultsExplorer',
3034
'benchmark_single_table',
35+
'benchmark_single_table_aws',
3136
'collect_results',
32-
'create_sdv_synthesizer_variant',
37+
'create_synthesizer_variant',
3338
'create_single_table_synthesizer',
34-
'get_available_datasets',
39+
'create_multi_table_synthesizer',
3540
'load_dataset',
3641
'make_summary_spreadsheet',
3742
]

sdgym/benchmark.py

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
write_csv,
5353
write_file,
5454
)
55-
from sdgym.synthesizers import CTGANSynthesizer, GaussianCopulaSynthesizer, UniformSynthesizer
55+
from sdgym.synthesizers import UniformSynthesizer
5656
from sdgym.synthesizers.base import BaselineSynthesizer
5757
from sdgym.utils import (
5858
calculate_score_time,
@@ -67,7 +67,7 @@
6767
)
6868

6969
LOGGER = logging.getLogger(__name__)
70-
DEFAULT_SYNTHESIZERS = [GaussianCopulaSynthesizer, CTGANSynthesizer, UniformSynthesizer]
70+
DEFAULT_SYNTHESIZERS = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'UniformSynthesizer']
7171
DEFAULT_DATASETS = [
7272
'adult',
7373
'alarm',
@@ -271,7 +271,11 @@ def _generate_job_args_list(
271271
if additional_datasets_folder is None
272272
else get_dataset_paths(
273273
modality='single_table',
274-
bucket=additional_datasets_folder,
274+
bucket=(
275+
additional_datasets_folder
276+
if is_s3_path(additional_datasets_folder)
277+
else os.path.join(additional_datasets_folder, 'single_table')
278+
),
275279
aws_access_key_id=aws_access_key_id,
276280
aws_secret_access_key=aws_secret_access_key_key,
277281
)
@@ -861,6 +865,7 @@ def _directory_exists(bucket_name, s3_file_path):
861865

862866

863867
def _check_write_permissions(s3_client, bucket_name):
868+
s3_client = s3_client or boto3.client('s3')
864869
try:
865870
s3_client.put_object(Bucket=bucket_name, Key='__test__', Body=b'')
866871
write_permission = True
@@ -881,7 +886,7 @@ def _create_sdgym_script(params, output_filepath):
881886
bucket_name, key_prefix = parse_s3_path(output_filepath)
882887
if not _directory_exists(bucket_name, key_prefix):
883888
raise ValueError(f'Directories in {key_prefix} do not exist')
884-
if not _check_write_permissions(bucket_name):
889+
if not _check_write_permissions(None, bucket_name):
885890
raise ValueError('No write permissions allowed for the bucket.')
886891

887892
# Add quotes to parameter strings
@@ -893,23 +898,22 @@ def _create_sdgym_script(params, output_filepath):
893898
params['output_filepath'] = "'" + params['output_filepath'] + "'"
894899

895900
# Generate the output script to run on the e2 instance
896-
synthesizer_string = 'synthesizers=['
897-
for synthesizer in params['synthesizers']:
901+
synthesizers = params.get('synthesizers', [])
902+
names = []
903+
for synthesizer in synthesizers:
898904
if isinstance(synthesizer, str):
899-
synthesizer_string += synthesizer + ', '
905+
names.append(synthesizer)
906+
elif hasattr(synthesizer, '__name__'):
907+
names.append(synthesizer.__name__)
900908
else:
901-
synthesizer_string += synthesizer.__name__ + ', '
902-
if params['synthesizers']:
903-
synthesizer_string = synthesizer_string[:-2]
904-
synthesizer_string += ']'
909+
names.append(synthesizer.__class__.__name__)
910+
911+
all_names = '", "'.join(names)
912+
synthesizer_string = f'synthesizers=["{all_names}"]'
905913
# The indentation of the string is important for the python script
906914
script_content = f"""import boto3
907915
from io import StringIO
908916
import sdgym
909-
from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer,
910-
GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer,
911-
SDVTabularSynthesizer, TVAESynthesizer)
912-
from sdgym.synthesizers import RealTabFormerSynthesizer
913917
914918
results = sdgym.benchmark_single_table(
915919
{synthesizer_string}, custom_synthesizers={params['custom_synthesizers']},
@@ -1186,7 +1190,7 @@ def benchmark_single_table(
11861190
custom_synthesizers (list[class] or ``None``):
11871191
A list of custom synthesizer classes to use. These can be completely custom or
11881192
they can be synthesizer variants (the output from ``create_single_table_synthesizer``
1189-
or ``create_sdv_synthesizer_variant``). Defaults to ``None``.
1193+
or ``create_synthesizer_variant``). Defaults to ``None``.
11901194
sdv_datasets (list[str] or ``None``):
11911195
Names of the SDV demo datasets to use for the benchmark. Defaults to
11921196
``[adult, alarm, census, child, expedia_hotel_logs, insurance, intrusion, news,

sdgym/cli/__main__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _download_datasets(args):
9797
_env_setup(args.logfile, args.verbose)
9898
datasets = args.datasets
9999
if not datasets:
100-
datasets = sdgym.datasets.get_available_datasets(
100+
datasets = sdgym.datasets._get_available_datasets(
101101
args.bucket, args.aws_access_key_id, args.aws_secret_access_key
102102
)['name']
103103

@@ -118,7 +118,7 @@ def _list_downloaded(args):
118118

119119

120120
def _list_available(args):
121-
datasets = sdgym.datasets.get_available_datasets(
121+
datasets = sdgym.datasets._get_available_datasets(
122122
args.bucket, args.aws_access_key_id, args.aws_secret_access_key
123123
)
124124
_print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})

sdgym/dataset_explorer.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,36 @@ def summarize_datasets(self, modality, output_filepath=None):
275275
dataset_summary.to_csv(output_filepath, index=False)
276276

277277
return dataset_summary
278+
279+
def list_datasets(self, modality, output_filepath=None):
280+
"""List available datasets for a modality using metainfo only.
281+
282+
This is a lightweight alternative to ``summarize_datasets`` that does not load
283+
the actual data. It reads dataset information from the ``metainfo.yaml`` files
284+
in the bucket and returns a table equivalent to the legacy
285+
``get_available_datasets`` output.
286+
287+
Args:
288+
modality (str):
289+
It must be ``'single_table'``, ``'multi_table'`` or ``'sequential'``.
290+
output_filepath (str, optional):
291+
Full path to a ``.csv`` file where the resulting table will be written.
292+
If not provided, the table is only returned.
293+
294+
Returns:
295+
pd.DataFrame:
296+
A DataFrame with columns: ``['dataset_name', 'size_MB', 'num_tables']``.
297+
"""
298+
self._validate_output_filepath(output_filepath)
299+
_validate_modality(modality)
300+
301+
dataframe = _get_available_datasets(
302+
modality=modality,
303+
bucket=self._bucket_name,
304+
aws_access_key_id=self.aws_access_key_id,
305+
aws_secret_access_key=self.aws_secret_access_key,
306+
)
307+
if output_filepath:
308+
dataframe.to_csv(output_filepath, index=False)
309+
310+
return dataframe

sdgym/datasets.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -254,21 +254,6 @@ def load_dataset(
254254
return data, metadata_dict
255255

256256

257-
def get_available_datasets(modality='single_table'):
258-
"""Get available single_table datasets.
259-
260-
Args:
261-
modality (str):
262-
It must be ``'single_table'``, ``'multi_table'`` or ``'sequential'``.
263-
264-
Return:
265-
pd.DataFrame:
266-
Table of available datasets and their sizes.
267-
"""
268-
_validate_modality(modality)
269-
return _get_available_datasets(modality)
270-
271-
272257
def get_dataset_paths(
273258
modality,
274259
datasets=None,

0 commit comments

Comments
 (0)