Skip to content

Commit 4c87a30

Browse files
committed
Add changes
Update tests; fix init
1 parent 44ac3e3 commit 4c87a30

File tree

5 files changed

+160
-13
lines changed

5 files changed

+160
-13
lines changed

sdgym/benchmark.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1815,7 +1815,7 @@ def benchmark_multi_table(
18151815
output_destination=None,
18161816
show_progress=False,
18171817
):
1818-
"""Run the SDGym benchmark on single-table datasets.
1818+
"""Run the SDGym benchmark on multi-table datasets.
18191819
18201820
Args:
18211821
synthesizers (list[string]):
@@ -1827,8 +1827,8 @@ def benchmark_multi_table(
18271827
or ``create_synthesizer_variant``). Defaults to ``None``.
18281828
sdv_datasets (list[str] or ``None``):
18291829
Names of the SDV demo datasets to use for the benchmark. Defaults to
1830-
``[adult, alarm, census, child, expedia_hotel_logs, insurance, intrusion, news,
1831-
covtype]``. Use ``None`` to disable using any sdv datasets.
1830+
``[NBA, financial, Student_loan, Biodegradability, fake_hotels, restbase,
1831+
airbnb-simplified]``. Use ``None`` to disable using any sdv datasets.
18321832
additional_datasets_folder (str or ``None``):
18331833
The path to a folder (local or an S3 bucket). Datasets found in this folder are
18341834
run in addition to the SDV datasets. If ``None``, no additional datasets are used.

sdgym/result_explorer/result_explorer.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,17 @@ def _resolve_effective_path(path, modality):
5757
class ResultsExplorer:
5858
"""Explorer for SDGym benchmark results, supporting both local and S3 storage."""
5959

60-
def __init__(self, path, aws_access_key_id=None, aws_secret_access_key=None):
60+
def __init__(self, path, modality, aws_access_key_id=None, aws_secret_access_key=None):
6161
self.path = path
62+
self.modality = modality
6263
self.aws_access_key_id = aws_access_key_id
6364
self.aws_secret_access_key = aws_secret_access_key
6465

6566
baseline_synthesizer = _get_baseline_synthesizer(modality)
6667
effective_path = _resolve_effective_path(path, modality)
6768
if is_s3_path(path):
69+
# Use original path to obtain client (keeps backwards compatibility),
70+
# but handler should operate on the modality-specific effective path.
6871
s3_client = _get_s3_client(path, aws_access_key_id, aws_secret_access_key)
6972
self._handler = S3ResultsHandler(
7073
effective_path, s3_client, baseline_synthesizer=baseline_synthesizer
@@ -83,7 +86,11 @@ def _get_file_path(self, results_folder_name, dataset_name, synthesizer_name, fi
8386
"""Validate access to the synthesizer or synthetic data file."""
8487
end_filename = f'{synthesizer_name}'
8588
if file_type == 'synthetic_data':
86-
end_filename += '_synthetic_data.csv'
89+
# Multi-table synthetic data is zipped (multiple CSVs), single table is CSV
90+
if self.modality == 'multi_table':
91+
end_filename += '_synthetic_data.zip'
92+
else:
93+
end_filename += '_synthetic_data.csv'
8794
elif file_type == 'synthesizer':
8895
end_filename += '.pkl'
8996

@@ -108,14 +115,17 @@ def load_synthetic_data(self, results_folder_name, dataset_name, synthesizer_nam
108115

109116
def load_real_data(self, dataset_name):
110117
"""Load the real data for a given dataset."""
111-
if dataset_name not in DEFAULT_SINGLE_TABLE_DATASETS:
118+
# Keep strict validation for single_table to preserve existing behavior
119+
if (self.modality is None or self.modality == 'single_table') and (
120+
dataset_name not in DEFAULT_SINGLE_TABLE_DATASETS
121+
):
112122
raise ValueError(
113123
f"Dataset '{dataset_name}' is not a SDGym dataset. "
114124
'Please provide a valid dataset name.'
115125
)
116126

117127
data, _ = load_dataset(
118-
modality='single_table',
128+
modality=self.modality or 'single_table',
119129
dataset=dataset_name,
120130
aws_access_key_id=self.aws_access_key_id,
121131
aws_secret_access_key=self.aws_secret_access_key,

sdgym/result_explorer/result_handler.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
import os
66
from abc import ABC, abstractmethod
77
from datetime import datetime
8+
from io import BytesIO
9+
from zipfile import ZipFile
810

911
import cloudpickle
1012
import pandas as pd
1113
import yaml
1214
from botocore.exceptions import ClientError
1315

16+
from sdgym._dataset_utils import _read_zipped_data
17+
1418
SYNTHESIZER_BASELINE = 'GaussianCopulaSynthesizer'
1519
RESULTS_FOLDER_PREFIX = 'SDGym_results_'
1620
metainfo_PREFIX = 'metainfo'
@@ -270,8 +274,12 @@ def load_synthesizer(self, file_path):
270274
return cloudpickle.load(f)
271275

272276
def load_synthetic_data(self, file_path):
273-
"""Load synthetic data from a CSV file."""
274-
return pd.read_csv(os.path.join(self.base_path, file_path))
277+
"""Load synthetic data from a CSV or ZIP file."""
278+
full_path = os.path.join(self.base_path, file_path)
279+
if full_path.endswith('.zip'):
280+
return _read_zipped_data(full_path, modality='multi_table')
281+
282+
return pd.read_csv(full_path)
275283

276284
def _get_results_files(self, folder_name, prefix, suffix):
277285
return [
@@ -383,10 +391,21 @@ def load_synthesizer(self, file_path):
383391

384392
def load_synthetic_data(self, file_path):
385393
"""Load synthetic data from S3."""
386-
response = self.s3_client.get_object(
387-
Bucket=self.bucket_name, Key=f'{self.prefix}{file_path}'
388-
)
389-
return pd.read_csv(io.BytesIO(response['Body'].read()))
394+
key = f'{self.prefix}{file_path}'
395+
response = self.s3_client.get_object(Bucket=self.bucket_name, Key=key)
396+
body = response['Body'].read()
397+
if file_path.endswith('.zip'):
398+
tables = {}
399+
with ZipFile(BytesIO(body)) as zf:
400+
for name in zf.namelist():
401+
if name.endswith('.csv'):
402+
table_name = os.path.splitext(os.path.basename(name))[0]
403+
with zf.open(name) as csv_file:
404+
tables[table_name] = pd.read_csv(csv_file, low_memory=False)
405+
406+
return tables
407+
408+
return pd.read_csv(io.BytesIO(body))
390409

391410
def _get_results_files(self, folder_name, prefix, suffix):
392411
s3_prefix = f'{self.prefix}{folder_name}/'

tests/unit/result_explorer/test_result_explorer.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import re
3+
import shutil
34
from unittest.mock import Mock, patch
45

56
import pandas as pd
@@ -76,6 +77,22 @@ def test__init__s3(self, mock_is_s3_path, mock_get_s3_client):
7677
assert result_explorer.aws_secret_access_key == aws_secret_access_key
7778
assert isinstance(result_explorer._handler, S3ResultsHandler)
7879

80+
def test_list_with_modality_local(self, tmp_path):
81+
"""Test the `list` method respects the modality subfolder (local)."""
82+
# Setup
83+
base = tmp_path / 'results'
84+
(base / 'unscoped_run').mkdir(parents=True)
85+
(base / 'multi_table' / 'run_mt1').mkdir(parents=True)
86+
(base / 'multi_table' / 'run_mt2').mkdir(parents=True)
87+
88+
result_explorer = ResultsExplorer(str(base), modality='multi_table')
89+
90+
# Run
91+
runs = result_explorer.list()
92+
93+
# Assert
94+
assert set(runs) == {'run_mt1', 'run_mt2'}
95+
7996
def test_list_local(self, tmp_path):
8097
"""Test the `list` method with a local path"""
8198
# Setup
@@ -136,6 +153,28 @@ def test__get_file_path(self):
136153
)
137154
assert file_path == expected_filepath
138155

156+
def test__get_file_path_multi_table_synthetic_data(self, tmp_path):
157+
"""Test `_get_file_path` returns .zip for multi_table synthetic data."""
158+
base = tmp_path / 'results'
159+
multi_table_dir = base / 'multi_table'
160+
multi_table_dir.mkdir(parents=True, exist_ok=True)
161+
explorer = ResultsExplorer(str(multi_table_dir), modality='multi_table')
162+
try:
163+
explorer._handler = Mock()
164+
explorer._handler.get_file_path.return_value = 'irrelevant'
165+
explorer._get_file_path(
166+
results_folder_name='results_folder_07_07_2025',
167+
dataset_name='my_dataset',
168+
synthesizer_name='my_synthesizer',
169+
file_type='synthetic_data',
170+
)
171+
explorer._handler.get_file_path.assert_called_once_with(
172+
['results_folder_07_07_2025', 'my_dataset_07_07_2025', 'my_synthesizer'],
173+
'my_synthesizer_synthetic_data.zip',
174+
)
175+
finally:
176+
shutil.rmtree(multi_table_dir)
177+
139178
def test_load_synthesizer(self, tmp_path):
140179
"""Test `load_synthesizer` method."""
141180
# Setup
@@ -209,6 +248,31 @@ def test_load_real_data(self, mock_load_dataset, tmp_path):
209248
)
210249
pd.testing.assert_frame_equal(real_data, expected_data)
211250

251+
@patch('sdgym.result_explorer.result_explorer.load_dataset')
252+
def test_load_real_data_multi_table(self, mock_load_dataset, tmp_path):
253+
"""Test `load_real_data` for multi_table modality calls load_dataset correctly."""
254+
dataset_name = 'synthea'
255+
expected_data = {'patients': pd.DataFrame({'id': [1]})}
256+
mock_load_dataset.return_value = (expected_data, None)
257+
multi_table_dir = tmp_path / 'multi_table'
258+
multi_table_dir.mkdir(parents=True, exist_ok=True)
259+
result_explorer = ResultsExplorer(tmp_path, modality='multi_table')
260+
261+
try:
262+
# Run
263+
real_data = result_explorer.load_real_data(dataset_name)
264+
265+
# Assert
266+
mock_load_dataset.assert_called_once_with(
267+
modality='multi_table',
268+
dataset='synthea',
269+
aws_access_key_id=None,
270+
aws_secret_access_key=None,
271+
)
272+
assert real_data == expected_data
273+
finally:
274+
shutil.rmtree(multi_table_dir)
275+
212276
def test_load_real_data_invalid_dataset(self, tmp_path):
213277
"""Test `load_real_data` method with an invalid dataset."""
214278
# Setup

tests/unit/result_explorer/test_result_handler.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import pickle
34
import re
@@ -326,6 +327,31 @@ def test_load_synthesizer(self, tmp_path):
326327
assert loaded_synthesizer is not None
327328
assert isinstance(loaded_synthesizer, GaussianCopulaSynthesizer)
328329

330+
def test_load_synthetic_data_zip(self, tmp_path):
331+
"""Test the `load_synthetic_data` method for zipped multi-table data (local)."""
332+
# Setup
333+
base = tmp_path / 'results'
334+
data_dir = base / 'SDGym_results_07_07_2025' / 'dataset_07_07_2025' / 'Synth'
335+
data_dir.mkdir(parents=True)
336+
337+
# Create a zip with two csvs
338+
import zipfile
339+
340+
zip_path = data_dir / 'Synth_synthetic_data.zip'
341+
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
342+
zf.writestr('table1.csv', 'a,b\n1,2\n')
343+
zf.writestr('table2.csv', 'x,y\n3,4\n')
344+
345+
result_handler = LocalResultsHandler(str(base))
346+
347+
# Run
348+
tables = result_handler.load_synthetic_data(str(zip_path))
349+
350+
# Assert
351+
assert set(tables.keys()) == {'table1', 'table2'}
352+
pd.testing.assert_frame_equal(tables['table1'], pd.DataFrame({'a': [1], 'b': [2]}))
353+
pd.testing.assert_frame_equal(tables['table2'], pd.DataFrame({'x': [3], 'y': [4]}))
354+
329355
@patch('os.path.exists')
330356
@patch('os.path.isfile')
331357
def test_get_file_path_local(self, mock_isfile, mock_exists):
@@ -466,6 +492,34 @@ def test_load_synthesizer(self):
466492
Bucket='my-bucket', Key='prefix/synthesizer.pkl'
467493
)
468494

495+
def test_load_synthetic_data_zip(self):
496+
"""Test the `load_synthetic_data` method for zipped multi-table data (S3)."""
497+
# Setup
498+
import zipfile
499+
500+
buffer = io.BytesIO()
501+
with zipfile.ZipFile(buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
502+
zf.writestr('customers.csv', 'id,age\n1,30\n')
503+
zf.writestr('transactions.csv', 'id,amount\n1,100\n')
504+
buffer.seek(0)
505+
506+
mock_s3_client = Mock()
507+
mock_s3_client.get_object.return_value = {'Body': Mock(read=lambda: buffer.getvalue())}
508+
result_handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
509+
510+
# Run
511+
tables = result_handler.load_synthetic_data('some/path.zip')
512+
513+
# Assert
514+
assert set(tables.keys()) == {'customers', 'transactions'}
515+
pd.testing.assert_frame_equal(tables['customers'], pd.DataFrame({'id': [1], 'age': [30]}))
516+
pd.testing.assert_frame_equal(
517+
tables['transactions'], pd.DataFrame({'id': [1], 'amount': [100]})
518+
)
519+
mock_s3_client.get_object.assert_called_once_with(
520+
Bucket='my-bucket', Key='prefix/some/path.zip'
521+
)
522+
469523
def test_get_file_path_s3(self):
470524
"""Test `get_file_path` for S3 path when folders and file exist."""
471525
# Setup

0 commit comments

Comments (0)