Skip to content

Commit ced519e

Browse files
committed
Add changes
Update tests; fix init
1 parent 3aa5202 commit ced519e

File tree

5 files changed

+261
-14
lines changed

5 files changed

+261
-14
lines changed

sdgym/result_explorer/result_explorer.py

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,51 @@ def _validate_local_path(path):
1414
raise ValueError(f"The provided path '{path}' is not a valid local directory.")
1515

1616

17+
_FOLDER_BY_MODALITY = {
18+
'single_table': 'single-table',
19+
'multi_table': 'multi_table',
20+
}
21+
22+
23+
def _resolve_effective_path(path, modality):
24+
"""Append the modality folder to the given base path if provided."""
25+
if not modality:
26+
return path
27+
28+
folder = _FOLDER_BY_MODALITY.get(modality)
29+
if folder is None:
30+
valid = ', '.join(sorted(_FOLDER_BY_MODALITY))
31+
raise ValueError(f'Invalid modality "{modality}". Valid options are: {valid}.')
32+
33+
# Avoid double-appending if already included
34+
if str(path).rstrip('/').endswith(('/' + folder, folder)):
35+
return path
36+
37+
if is_s3_path(path):
38+
path = path.rstrip('/') + '/' + folder
39+
return path
40+
41+
return os.path.join(path, folder)
42+
43+
1744
class ResultsExplorer:
1845
"""Explorer for SDGym benchmark results, supporting both local and S3 storage."""
1946

20-
def __init__(self, path, modality=None, aws_access_key_id=None, aws_secret_access_key=None):
    """Initialize the explorer for a local or S3 results location.

    Args:
        path (str): Base results path; either a local directory or an
            ``s3://`` URI.
        modality (str or None): Optional modality ('single_table' or
            'multi_table'). When given, all handler operations are scoped to
            the corresponding modality subfolder. Defaults to ``None`` so
            existing callers that pass only ``path`` (and keyword
            credentials) keep working; ``load_real_data`` already treats a
            ``None`` modality as single-table.
        aws_access_key_id (str or None): AWS credential; S3 paths only.
        aws_secret_access_key (str or None): AWS credential; S3 paths only.
    """
    self.path = path
    self.modality = modality
    self.aws_access_key_id = aws_access_key_id
    self.aws_secret_access_key = aws_secret_access_key

    # Scope the handler to the modality-specific subfolder (no-op when
    # modality is None).
    effective_path = _resolve_effective_path(path, modality)
    if is_s3_path(path):
        # Use the original path to obtain the client (keeps backwards
        # compatibility), but the handler operates on the modality-specific
        # effective path.
        s3_client = _get_s3_client(path, aws_access_key_id, aws_secret_access_key)
        self._handler = S3ResultsHandler(effective_path, s3_client)
    else:
        _validate_local_path(effective_path)
        self._handler = LocalResultsHandler(effective_path)
3162

3263
def list(self):
3364
"""List all runs available in the results directory."""
@@ -37,7 +68,11 @@ def _get_file_path(self, results_folder_name, dataset_name, synthesizer_name, fi
3768
"""Validate access to the synthesizer or synthetic data file."""
3869
end_filename = f'{synthesizer_name}'
3970
if file_type == 'synthetic_data':
40-
end_filename += '_synthetic_data.csv'
71+
# Multi-table synthetic data is zipped (multiple CSVs), single table is CSV
72+
if self.modality == 'multi_table':
73+
end_filename += '_synthetic_data.zip'
74+
else:
75+
end_filename += '_synthetic_data.csv'
4176
elif file_type == 'synthesizer':
4277
end_filename += '.pkl'
4378

@@ -62,14 +97,17 @@ def load_synthetic_data(self, results_folder_name, dataset_name, synthesizer_nam
6297

6398
def load_real_data(self, dataset_name):
6499
"""Load the real data for a given dataset."""
65-
if dataset_name not in DEFAULT_SINGLE_TABLE_DATASETS:
100+
# Keep strict validation for single_table to preserve existing behavior
101+
if (self.modality is None or self.modality == 'single_table') and (
102+
dataset_name not in DEFAULT_SINGLE_TABLE_DATASETS
103+
):
66104
raise ValueError(
67105
f"Dataset '{dataset_name}' is not a SDGym dataset. "
68106
'Please provide a valid dataset name.'
69107
)
70108

71109
data, _ = load_dataset(
72-
modality='single_table',
110+
modality=self.modality or 'single_table',
73111
dataset=dataset_name,
74112
aws_access_key_id=self.aws_access_key_id,
75113
aws_secret_access_key=self.aws_secret_access_key,

sdgym/result_explorer/result_handler.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
import os
66
from abc import ABC, abstractmethod
77
from datetime import datetime
8+
from io import BytesIO
9+
from zipfile import ZipFile
810

911
import cloudpickle
1012
import pandas as pd
1113
import yaml
1214
from botocore.exceptions import ClientError
1315

16+
from sdgym._dataset_utils import _read_zipped_data
17+
1418
SYNTHESIZER_BASELINE = 'GaussianCopulaSynthesizer'
1519
RESULTS_FOLDER_PREFIX = 'SDGym_results_'
1620
metainfo_PREFIX = 'metainfo'
@@ -262,8 +266,12 @@ def load_synthesizer(self, file_path):
262266
return cloudpickle.load(f)
263267

264268
def load_synthetic_data(self, file_path):
    """Load synthetic data from disk.

    A ``.zip`` file is treated as zipped multi-table data and loaded via
    ``_read_zipped_data``; any other file is read as a single CSV table.

    Args:
        file_path (str): Path relative to ``self.base_path``.

    Returns:
        The loaded synthetic data (DataFrame for CSV; multi-table structure
        for ZIP).
    """
    target = os.path.join(self.base_path, file_path)
    if not target.endswith('.zip'):
        return pd.read_csv(target)

    return _read_zipped_data(target, modality='multi_table')
267275

268276
def _get_results_files(self, folder_name, prefix, suffix):
269277
return [
@@ -374,10 +382,21 @@ def load_synthesizer(self, file_path):
374382

375383
def load_synthetic_data(self, file_path):
    """Load synthetic data from S3.

    ``.zip`` keys are unpacked into a dict mapping table name (the CSV's
    base filename without extension) to a DataFrame; any other key is read
    as a single CSV table.

    Args:
        file_path (str): Key suffix appended to ``self.prefix``.

    Returns:
        pandas.DataFrame or dict[str, pandas.DataFrame]: The loaded data.
    """
    payload = self.s3_client.get_object(
        Bucket=self.bucket_name,
        Key=f'{self.prefix}{file_path}',
    )['Body'].read()

    if not file_path.endswith('.zip'):
        return pd.read_csv(io.BytesIO(payload))

    tables = {}
    with ZipFile(BytesIO(payload)) as archive:
        # Non-CSV members (if any) are deliberately ignored.
        for member in archive.namelist():
            if member.endswith('.csv'):
                table_name = os.path.splitext(os.path.basename(member))[0]
                with archive.open(member) as handle:
                    tables[table_name] = pd.read_csv(handle, low_memory=False)

    return tables
381400

382401
def _get_results_files(self, folder_name, prefix, suffix):
383402
s3_prefix = f'{self.prefix}{folder_name}/'

tests/integration/result_explorer/test_result_explorer.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,71 @@ def test_summarize():
8888
expected_results['Win'] = expected_results['Win'].astype('int64')
8989
pd.testing.assert_frame_equal(summary, expected_summary)
9090
pd.testing.assert_frame_equal(results, expected_results)
91+
92+
93+
def test_summarize_multi_table(tmp_path):
    """Test summarize works under the multi_table subfolder."""
    # Setup: copy existing fixtures into multi_table folder
    import shutil

    src_root = 'tests/integration/result_explorer/_benchmark_results'
    dst_root = tmp_path / 'benchmark_output' / 'multi_table'
    dst_root.mkdir(parents=True, exist_ok=True)
    run_folders = (
        'SDGym_results_04_05_2024',
        'SDGym_results_05_10_2024',
        'SDGym_results_10_11_2024',
    )
    for run_folder in run_folders:
        shutil.copytree(f'{src_root}/{run_folder}', dst_root / run_folder)

    explorer = ResultsExplorer(str(tmp_path / 'benchmark_output'), modality='multi_table')

    # Run
    summary, results = explorer.summarize('SDGym_results_10_11_2024')

    # Assert
    expected_summary = pd.DataFrame({
        'Synthesizer': ['CTGANSynthesizer', 'CopulaGANSynthesizer', 'TVAESynthesizer'],
        '10_11_2024 - # datasets: 9 - sdgym version: 0.9.1': [6, 4, 5],
        '05_10_2024 - # datasets: 9 - sdgym version: 0.8.0': [4, 4, 5],
        '04_05_2024 - # datasets: 9 - sdgym version: 0.7.0': [5, 3, 5],
    })
    expected_results = (
        pd.read_csv(f'{src_root}/SDGym_results_10_11_2024/results.csv')
        .sort_values(by=['Dataset', 'Synthesizer'])
        .reset_index(drop=True)
    )
    expected_results['Win'] = expected_results['Win'].astype('int64')
    pd.testing.assert_frame_equal(summary, expected_summary)
    pd.testing.assert_frame_equal(results, expected_results)
128+
129+
130+
def test_list_and_load_results_multi_table(tmp_path):
    """Test listing and loading results under multi_table subfolder."""
    # Setup
    import shutil

    src_root = 'tests/integration/result_explorer/_benchmark_results/SDGym_results_10_11_2024'
    dst_root = tmp_path / 'benchmark_output' / 'multi_table' / 'SDGym_results_10_11_2024'
    shutil.copytree(src_root, dst_root)

    explorer = ResultsExplorer(str(tmp_path / 'benchmark_output'), modality='multi_table')

    # Run
    runs = explorer.list()
    assert runs == ['SDGym_results_10_11_2024']

    run_name = runs[0]
    sort_columns = ['Dataset', 'Synthesizer']
    loaded_results = (
        explorer.load_results(run_name).sort_values(by=sort_columns).reset_index(drop=True)
    )
    metainfo = explorer.load_metainfo(run_name)

    # Assert
    expected_results = (
        pd.read_csv(dst_root / 'results.csv').sort_values(by=sort_columns).reset_index(drop=True)
    )
    pd.testing.assert_frame_equal(loaded_results, expected_results)
    assert isinstance(metainfo, dict) and len(metainfo) >= 1

tests/unit/result_explorer/test_result_explorer.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import shutil
23
from unittest.mock import Mock, patch
34

45
import pandas as pd
@@ -59,7 +60,11 @@ def test__init__s3(self, mock_is_s3_path, mock_get_s3_client):
5960
mock_get_s3_client.return_value = s3_client
6061

6162
# Run
62-
result_explorer = ResultsExplorer(path, aws_access_key_id, aws_secret_access_key)
63+
result_explorer = ResultsExplorer(
64+
path,
65+
aws_access_key_id=aws_access_key_id,
66+
aws_secret_access_key=aws_secret_access_key,
67+
)
6368

6469
# Assert
6570
mock_is_s3_path.assert_called_once_with(path)
@@ -69,6 +74,22 @@ def test__init__s3(self, mock_is_s3_path, mock_get_s3_client):
6974
assert result_explorer.aws_secret_access_key == aws_secret_access_key
7075
assert isinstance(result_explorer._handler, S3ResultsHandler)
7176

77+
def test_list_with_modality_local(self, tmp_path):
78+
"""Test the `list` method respects the modality subfolder (local)."""
79+
# Setup
80+
base = tmp_path / 'results'
81+
(base / 'unscoped_run').mkdir(parents=True)
82+
(base / 'multi_table' / 'run_mt1').mkdir(parents=True)
83+
(base / 'multi_table' / 'run_mt2').mkdir(parents=True)
84+
85+
result_explorer = ResultsExplorer(str(base), modality='multi_table')
86+
87+
# Run
88+
runs = result_explorer.list()
89+
90+
# Assert
91+
assert set(runs) == {'run_mt1', 'run_mt2'}
92+
7293
def test_list_local(self, tmp_path):
7394
"""Test the `list` method with a local path"""
7495
# Setup
@@ -129,6 +150,28 @@ def test__get_file_path(self):
129150
)
130151
assert file_path == expected_filepath
131152

153+
def test__get_file_path_multi_table_synthetic_data(self, tmp_path):
154+
"""Test `_get_file_path` returns .zip for multi_table synthetic data."""
155+
base = tmp_path / 'results'
156+
multi_table_dir = base / 'multi_table'
157+
multi_table_dir.mkdir(parents=True, exist_ok=True)
158+
explorer = ResultsExplorer(str(multi_table_dir), modality='multi_table')
159+
try:
160+
explorer._handler = Mock()
161+
explorer._handler.get_file_path.return_value = 'irrelevant'
162+
explorer._get_file_path(
163+
results_folder_name='results_folder_07_07_2025',
164+
dataset_name='my_dataset',
165+
synthesizer_name='my_synthesizer',
166+
file_type='synthetic_data',
167+
)
168+
explorer._handler.get_file_path.assert_called_once_with(
169+
['results_folder_07_07_2025', 'my_dataset_07_07_2025', 'my_synthesizer'],
170+
'my_synthesizer_synthetic_data.zip',
171+
)
172+
finally:
173+
shutil.rmtree(multi_table_dir)
174+
132175
def test_load_synthesizer(self, tmp_path):
133176
"""Test `load_synthesizer` method."""
134177
# Setup
@@ -196,6 +239,31 @@ def test_load_real_data(self, mock_load_dataset, tmp_path):
196239
)
197240
pd.testing.assert_frame_equal(real_data, expected_data)
198241

242+
@patch('sdgym.result_explorer.result_explorer.load_dataset')
243+
def test_load_real_data_multi_table(self, mock_load_dataset, tmp_path):
244+
"""Test `load_real_data` for multi_table modality calls load_dataset correctly."""
245+
dataset_name = 'synthea'
246+
expected_data = {'patients': pd.DataFrame({'id': [1]})}
247+
mock_load_dataset.return_value = (expected_data, None)
248+
multi_table_dir = tmp_path / 'multi_table'
249+
multi_table_dir.mkdir(parents=True, exist_ok=True)
250+
result_explorer = ResultsExplorer(tmp_path, modality='multi_table')
251+
252+
try:
253+
# Run
254+
real_data = result_explorer.load_real_data(dataset_name)
255+
256+
# Assert
257+
mock_load_dataset.assert_called_once_with(
258+
modality='multi_table',
259+
dataset='synthea',
260+
aws_access_key_id=None,
261+
aws_secret_access_key=None,
262+
)
263+
assert real_data == expected_data
264+
finally:
265+
shutil.rmtree(multi_table_dir)
266+
199267
def test_load_real_data_invalid_dataset(self, tmp_path):
200268
"""Test `load_real_data` method with an invalid dataset."""
201269
# Setup

tests/unit/result_explorer/test_result_handler.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import pickle
34
import re
@@ -324,6 +325,31 @@ def test_load_synthesizer(self, tmp_path):
324325
assert loaded_synthesizer is not None
325326
assert isinstance(loaded_synthesizer, GaussianCopulaSynthesizer)
326327

328+
def test_load_synthetic_data_zip(self, tmp_path):
329+
"""Test the `load_synthetic_data` method for zipped multi-table data (local)."""
330+
# Setup
331+
base = tmp_path / 'results'
332+
data_dir = base / 'SDGym_results_07_07_2025' / 'dataset_07_07_2025' / 'Synth'
333+
data_dir.mkdir(parents=True)
334+
335+
# Create a zip with two csvs
336+
import zipfile
337+
338+
zip_path = data_dir / 'Synth_synthetic_data.zip'
339+
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
340+
zf.writestr('table1.csv', 'a,b\n1,2\n')
341+
zf.writestr('table2.csv', 'x,y\n3,4\n')
342+
343+
result_handler = LocalResultsHandler(str(base))
344+
345+
# Run
346+
tables = result_handler.load_synthetic_data(str(zip_path))
347+
348+
# Assert
349+
assert set(tables.keys()) == {'table1', 'table2'}
350+
pd.testing.assert_frame_equal(tables['table1'], pd.DataFrame({'a': [1], 'b': [2]}))
351+
pd.testing.assert_frame_equal(tables['table2'], pd.DataFrame({'x': [3], 'y': [4]}))
352+
327353
@patch('os.path.exists')
328354
@patch('os.path.isfile')
329355
def test_get_file_path_local(self, mock_isfile, mock_exists):
@@ -464,6 +490,34 @@ def test_load_synthesizer(self):
464490
Bucket='my-bucket', Key='prefix/synthesizer.pkl'
465491
)
466492

493+
def test_load_synthetic_data_zip(self):
494+
"""Test the `load_synthetic_data` method for zipped multi-table data (S3)."""
495+
# Setup
496+
import zipfile
497+
498+
buffer = io.BytesIO()
499+
with zipfile.ZipFile(buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
500+
zf.writestr('customers.csv', 'id,age\n1,30\n')
501+
zf.writestr('transactions.csv', 'id,amount\n1,100\n')
502+
buffer.seek(0)
503+
504+
mock_s3_client = Mock()
505+
mock_s3_client.get_object.return_value = {'Body': Mock(read=lambda: buffer.getvalue())}
506+
result_handler = S3ResultsHandler('s3://my-bucket/prefix', mock_s3_client)
507+
508+
# Run
509+
tables = result_handler.load_synthetic_data('some/path.zip')
510+
511+
# Assert
512+
assert set(tables.keys()) == {'customers', 'transactions'}
513+
pd.testing.assert_frame_equal(tables['customers'], pd.DataFrame({'id': [1], 'age': [30]}))
514+
pd.testing.assert_frame_equal(
515+
tables['transactions'], pd.DataFrame({'id': [1], 'amount': [100]})
516+
)
517+
mock_s3_client.get_object.assert_called_once_with(
518+
Bucket='my-bucket', Key='prefix/some/path.zip'
519+
)
520+
467521
def test_get_file_path_s3(self):
468522
"""Test `get_file_path` for S3 path when folders and file exist."""
469523
# Setup

0 commit comments

Comments
 (0)