Merged
Commits
25 commits
9f47381
def 481
R-Palazzo Nov 5, 2025
536156f
change 1
R-Palazzo Nov 5, 2025
084fae9
fix integration tests
R-Palazzo Nov 5, 2025
268d022
define _sdv_dynamic
R-Palazzo Nov 5, 2025
ee83506
update the create_variant method
R-Palazzo Nov 6, 2025
2796df0
remove sequential code (keep only dataset related code)
R-Palazzo Nov 6, 2025
e4cf11a
cleaning sdv file
R-Palazzo Nov 6, 2025
dbfe210
cleaning
R-Palazzo Nov 7, 2025
e048948
fix tests
R-Palazzo Nov 7, 2025
d50d4e7
fix lint
R-Palazzo Nov 7, 2025
2fe1201
cleaning
R-Palazzo Nov 10, 2025
6c27b37
remove create_multi_table_synthesizer and MultiSingleTableBaselineSyn…
R-Palazzo Nov 10, 2025
3756f74
add tests
R-Palazzo Nov 10, 2025
a7660e0
add `scale` parameter
R-Palazzo Nov 10, 2025
7f935fb
refactor create_single_table_synthesizer and create_multi_table_synth…
R-Palazzo Nov 10, 2025
cf9ea70
Address comments: Define BaselineSDVSynthesizer + Tests
R-Palazzo Nov 11, 2025
d0304a1
fix skipped test
R-Palazzo Nov 11, 2025
6dbf2ea
clean modality usage
R-Palazzo Nov 11, 2025
389cff7
address comments
R-Palazzo Nov 12, 2025
7e6ea9f
fix deprecated parameters
R-Palazzo Nov 12, 2025
cdac814
rename create_sdv_synthesizer_variant -> create_synthesizer_variant
R-Palazzo Nov 12, 2025
e08b116
get rid of SYNTHESIZER_MAPPING
R-Palazzo Nov 12, 2025
33e5813
adress comment: define create_sdv_synthesizer_class
R-Palazzo Nov 13, 2025
edcec80
have consistent class generation everywhere (using type())
R-Palazzo Nov 13, 2025
ec4d347
cleaning
R-Palazzo Nov 13, 2025
12 changes: 9 additions & 3 deletions sdgym/__init__.py
@@ -12,12 +12,16 @@

import logging

from sdgym.benchmark import benchmark_single_table
from sdgym.benchmark import benchmark_single_table, benchmark_single_table_aws
from sdgym.cli.collect import collect_results
from sdgym.cli.summary import make_summary_spreadsheet
from sdgym.dataset_explorer import DatasetExplorer
from sdgym.datasets import get_available_datasets, load_dataset
from sdgym.synthesizers import create_sdv_synthesizer_variant, create_single_table_synthesizer
from sdgym.synthesizers import (
create_synthesizer_variant,
create_single_table_synthesizer,
create_multi_table_synthesizer,
)
from sdgym.result_explorer import ResultsExplorer

# Clear the logging wrongfully configured by tensorflow/absl
@@ -28,9 +32,11 @@
'DatasetExplorer',
'ResultsExplorer',
'benchmark_single_table',
'benchmark_single_table_aws',
'collect_results',
'create_sdv_synthesizer_variant',
'create_synthesizer_variant',
'create_single_table_synthesizer',
'create_multi_table_synthesizer',
'get_available_datasets',
'load_dataset',
'make_summary_spreadsheet',
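To sanity-check the re-exported names, a minimal sketch (assuming this branch of sdgym is installed) could be:

import sdgym

# The renamed variant helper, the new multi-table helper and the AWS benchmark
# entry point should all appear in the public namespace after this change.
for name in (
    'benchmark_single_table_aws',
    'create_synthesizer_variant',
    'create_single_table_synthesizer',
    'create_multi_table_synthesizer',
):
    assert name in sdgym.__all__, f'{name} should be exported'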
30 changes: 15 additions & 15 deletions sdgym/benchmark.py
@@ -52,7 +52,7 @@
write_csv,
write_file,
)
from sdgym.synthesizers import CTGANSynthesizer, GaussianCopulaSynthesizer, UniformSynthesizer
from sdgym.synthesizers import UniformSynthesizer
from sdgym.synthesizers.base import BaselineSynthesizer
from sdgym.utils import (
calculate_score_time,
@@ -67,7 +67,7 @@
)

LOGGER = logging.getLogger(__name__)
DEFAULT_SYNTHESIZERS = [GaussianCopulaSynthesizer, CTGANSynthesizer, UniformSynthesizer]
DEFAULT_SYNTHESIZERS = ['GaussianCopulaSynthesizer', 'CTGANSynthesizer', 'UniformSynthesizer']
DEFAULT_DATASETS = [
'adult',
'alarm',
@@ -861,6 +861,7 @@ def _directory_exists(bucket_name, s3_file_path):


def _check_write_permissions(s3_client, bucket_name):
s3_client = s3_client or boto3.client('s3')
try:
s3_client.put_object(Bucket=bucket_name, Key='__test__', Body=b'')
write_permission = True
@@ -881,7 +882,7 @@ def _create_sdgym_script(params, output_filepath):
bucket_name, key_prefix = parse_s3_path(output_filepath)
if not _directory_exists(bucket_name, key_prefix):
raise ValueError(f'Directories in {key_prefix} do not exist')
if not _check_write_permissions(bucket_name):
if not _check_write_permissions(None, bucket_name):
raise ValueError('No write permissions allowed for the bucket.')

# Add quotes to parameter strings
@@ -893,23 +894,22 @@ def _create_sdgym_script(params, output_filepath):
params['output_filepath'] = "'" + params['output_filepath'] + "'"

# Generate the output script to run on the e2 instance
synthesizer_string = 'synthesizers=['
for synthesizer in params['synthesizers']:
synthesizers = params.get('synthesizers', [])
names = []
for synthesizer in synthesizers:
if isinstance(synthesizer, str):
synthesizer_string += synthesizer + ', '
names.append(synthesizer)
elif hasattr(synthesizer, '__name__'):
names.append(synthesizer.__name__)
else:
synthesizer_string += synthesizer.__name__ + ', '
if params['synthesizers']:
synthesizer_string = synthesizer_string[:-2]
synthesizer_string += ']'
names.append(synthesizer.__class__.__name__)

all_names = '", "'.join(names)
synthesizer_string = f'synthesizers=["{all_names}"]'
# The indentation of the string is important for the python script
script_content = f"""import boto3
from io import StringIO
import sdgym
from sdgym.synthesizers.sdv import (CopulaGANSynthesizer, CTGANSynthesizer,
GaussianCopulaSynthesizer, HMASynthesizer, PARSynthesizer, SDVRelationalSynthesizer,
SDVTabularSynthesizer, TVAESynthesizer)
from sdgym.synthesizers import RealTabFormerSynthesizer

results = sdgym.benchmark_single_table(
{synthesizer_string}, custom_synthesizers={params['custom_synthesizers']},
@@ -1186,7 +1186,7 @@ def benchmark_single_table(
custom_synthesizers (list[class] or ``None``):
A list of custom synthesizer classes to use. These can be completely custom or
they can be synthesizer variants (the output from ``create_single_table_synthesizer``
or ``create_sdv_synthesizer_variant``). Defaults to ``None``.
or ``create_synthesizer_variant``). Defaults to ``None``.
sdv_datasets (list[str] or ``None``):
Names of the SDV demo datasets to use for the benchmark. Defaults to
``[adult, alarm, census, child, expedia_hotel_logs, insurance, intrusion, news,
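For reference, the new name-normalization logic in `_create_sdgym_script` can be exercised on its own. The helper name below is hypothetical and only mirrors the inlined loop shown above: strings are kept, classes contribute `__name__`, and instances fall back to their class name.

def build_synthesizer_arg(synthesizers):
    """Mirror of the inlined loop in _create_sdgym_script (hypothetical helper)."""
    names = []
    for synthesizer in synthesizers or []:
        if isinstance(synthesizer, str):
            names.append(synthesizer)
        elif hasattr(synthesizer, '__name__'):
            names.append(synthesizer.__name__)
        else:
            names.append(synthesizer.__class__.__name__)

    all_names = '", "'.join(names)
    return f'synthesizers=["{all_names}"]'


class FakeSynthesizer:  # stand-in class for illustration, not a real SDGym synthesizer
    pass


print(build_synthesizer_arg(['CTGANSynthesizer', FakeSynthesizer, FakeSynthesizer()]))
# synthesizers=["CTGANSynthesizer", "FakeSynthesizer", "FakeSynthesizer"]

This matches the change to DEFAULT_SYNTHESIZERS, which now lists synthesizer names as strings rather than classes.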
45 changes: 17 additions & 28 deletions sdgym/synthesizers/__init__.py
@@ -1,43 +1,32 @@
"""Synthesizers module."""

from sdgym.synthesizers.generate import (
SYNTHESIZER_MAPPING,
create_multi_table_synthesizer,
create_sdv_synthesizer_variant,
create_sequential_synthesizer,
create_synthesizer_variant,
create_single_table_synthesizer,
create_multi_table_synthesizer,
)
from sdgym.synthesizers.identity import DataIdentity
from sdgym.synthesizers.column import ColumnSynthesizer
from sdgym.synthesizers.realtabformer import RealTabFormerSynthesizer
from sdgym.synthesizers.sdv import (
CopulaGANSynthesizer,
CTGANSynthesizer,
GaussianCopulaSynthesizer,
HMASynthesizer,
PARSynthesizer,
SDVRelationalSynthesizer,
SDVTabularSynthesizer,
TVAESynthesizer,
)
from sdgym.synthesizers.uniform import UniformSynthesizer
from sdgym.synthesizers.utils import (
get_available_single_table_synthesizers,
get_available_multi_table_synthesizers,
)
from sdgym.synthesizers.sdv import create_sdv_synthesizer_class, _get_all_sdv_synthesizers


__all__ = (
__all__ = [
'DataIdentity',
'ColumnSynthesizer',
'CTGANSynthesizer',
'TVAESynthesizer',
'UniformSynthesizer',
'CopulaGANSynthesizer',
'GaussianCopulaSynthesizer',
'HMASynthesizer',
'PARSynthesizer',
'SDVTabularSynthesizer',
'SDVRelationalSynthesizer',
'RealTabFormerSynthesizer',
'create_single_table_synthesizer',
'create_multi_table_synthesizer',
'create_sdv_synthesizer_variant',
'create_sequential_synthesizer',
'SYNTHESIZER_MAPPING',
'RealTabFormerSynthesizer',
)
'create_synthesizer_variant',
'get_available_single_table_synthesizers',
'get_available_multi_table_synthesizers',
]

for sdv_name in _get_all_sdv_synthesizers():
create_sdv_synthesizer_class(sdv_name)
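The loop above registers every SDV synthesizer as a dynamically generated class at import time. `create_sdv_synthesizer_class` itself lives in sdgym/synthesizers/sdv.py and is not part of this diff; a rough sketch of the `type()`-based pattern the commit history describes ("have consistent class generation everywhere (using type())"), with an assumed helper name and attribute defaults, would be:

from sdgym.synthesizers.base import BaselineSynthesizer


def make_sdv_wrapper_class(sdv_name, model_kwargs=None):
    """Hypothetical helper: build a BaselineSynthesizer subclass named after
    an SDV synthesizer, carrying the class-level flags added in base.py."""
    attrs = {
        '_MODEL_KWARGS': model_kwargs or {},
        '_NATIVELY_SUPPORTED': True,
    }
    return type(sdv_name, (BaselineSynthesizer,), attrs)


CTGANVariant = make_sdv_wrapper_class('CTGANSynthesizer', {'epochs': 10})
print(CTGANVariant.__name__)  # CTGANSynthesizer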
90 changes: 14 additions & 76 deletions sdgym/synthesizers/base.py
@@ -12,6 +12,9 @@
class BaselineSynthesizer(abc.ABC):
"""Base class for all the ``SDGym`` baselines."""

_MODEL_KWARGS = {}
_NATIVELY_SUPPORTED = True

@classmethod
def get_subclasses(cls, include_parents=False):
"""Recursively find subclasses of this Baseline.
@@ -30,6 +33,17 @@ def get_subclasses(cls, include_parents=False):

return subclasses

@classmethod
def _get_supported_synthesizers(cls):
"""Get the natively supported synthesizer class names."""
subclasses = cls.get_subclasses(include_parents=True)
synthesizers = set()
for name, subclass in subclasses.items():
if subclass._NATIVELY_SUPPORTED:
synthesizers.add(name)

return sorted(synthesizers)

@classmethod
def get_baselines(cls):
"""Get baseline classes."""
@@ -76,79 +90,3 @@ def sample_from_synthesizer(self, synthesizer, n_samples):
should be a dict mapping table name to DataFrame.
"""
return self._sample_from_synthesizer(synthesizer, n_samples)


class MultiSingleTableBaselineSynthesizer(BaselineSynthesizer, abc.ABC):
"""Base class for SingleTableBaselines that are used on multi table scenarios.

These classes model and sample each table independently and then just
randomly choose ids from the parent tables to form the relationships.

NOTE: doesn't currently work.
"""

def get_trained_synthesizer(self, data, metadata):
"""Get the trained synthesizer.

Args:
data (dict):
A dict mapping table name to table data.
metadata (sdv.metadata.multi_table.MultiTableMetadata):
The multi-table metadata.

Returns:
dict:
A mapping of table name to synthesizers.
"""
self.metadata = metadata
synthesizers = {
table_name: self._get_trained_synthesizer(table, metadata.tables[table_name])
for table_name, table in data.items()
}
self.table_columns = {table_name: data[table_name].columns for table_name in data.keys()}

return synthesizers

def _get_foreign_keys(self, metadata, table_name, child_name):
foreign_keys = []
for relation in metadata.relationships:
if (
table_name == relation['parent_table_name']
and child_name == relation['child_table_name']
):
foreign_keys.append(relation['child_foreign_key'])

return foreign_keys

def sample_from_synthesizer(self, synthesizers, n_samples):
"""Sample from the given synthesizers.

Args:
synthesizers (dict):
A dict mapping table name to table synthesizer.
n_samples (int):
The number of samples.

Returns:
dict:
A mapping of table name to sampled table data.
"""
tables = {
table_name: self._sample_from_synthesizer(synthesizer, n_samples)
for table_name, synthesizer in synthesizers.items()
}

for table_name, table in tables.items():
table_metadata = self.metadata.tables[table_name]
parents = list(table_metadata._get_parent_map().keys())
for parent_name in parents:
parent = tables[parent_name]
primary_key = self.metadata.tables[table_name].primary_key
foreign_keys = self._get_foreign_keys(self.metadata, parent_name, table_name)
for foreign_key in foreign_keys:
foreign_key_values = parent[primary_key].sample(len(table), replace=True)
table[foreign_key] = foreign_key_values.to_numpy()

tables[table_name] = table[self.table_columns[table_name]]

return tables
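The new `_NATIVELY_SUPPORTED` flag combined with `get_subclasses` is what lets `_get_supported_synthesizers` report only the synthesizers that ship with SDGym. A self-contained mock of that filtering (stand-in classes, not the real hierarchy):

class Base:
    _NATIVELY_SUPPORTED = True

    @classmethod
    def get_subclasses(cls):
        # Recursively collect every subclass, keyed by class name.
        found = {}
        for subclass in cls.__subclasses__():
            found[subclass.__name__] = subclass
            found.update(subclass.get_subclasses())
        return found

    @classmethod
    def supported(cls):
        # Keep only subclasses that declare themselves natively supported.
        return sorted(
            name
            for name, subclass in cls.get_subclasses().items()
            if subclass._NATIVELY_SUPPORTED
        )


class NativeSynthesizer(Base):
    pass


class ExternalSynthesizer(Base):
    _NATIVELY_SUPPORTED = False


print(Base.supported())  # ['NativeSynthesizer']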