cleaning 515

R-Palazzo · R-Palazzo · commit 53bc640831f5 · 2025-12-17T17:55:03.000+01:00
diff --git a/.github/workflows/run_benchmark_multi_table.yml b/.github/workflows/run_benchmark_multi_table.yml
diff --git a/sdgym/_benchmark/config_utils.py b/sdgym/_benchmark/config_utils.py
@@ -20,7 +20,7 @@
         ),
         'gpu_type': 'nvidia-tesla-t4',
         'gpu_count': 1,
-        'install_nvidia_driver': False,  # DLVM already has drivers/tooling
+        'install_nvidia_driver': False,
         'delete_on_success': True,
         'delete_on_error': True,
         'stop_fallback': True,
diff --git a/sdgym/run_benchmark/run_benchmark.py b/sdgym/run_benchmark/run_benchmark.py
@@ -1,40 +1,31 @@
 """Script to run a benchmark and upload results to S3."""
 
-import argparse
-import base64
 import json
 import os
 from datetime import datetime, timezone
-from pathlib import Path
 
 from botocore.exceptions import ClientError
 
-from sdgym._benchmark.benchmark import _benchmark_multi_table_compute_gcp
 from sdgym.benchmark import benchmark_single_table_aws
 from sdgym.run_benchmark.utils import (
-    GCP_PROJECT,
-    GCP_ZONE,
     KEY_DATE_FILE,
     OUTPUT_DESTINATION_AWS,
-    SYNTHESIZERS_SPLIT_MULTI_TABLE,
-    SYNTHESIZERS_SPLIT_SINGLE_TABLE,
+    SYNTHESIZERS_SPLIT,
     get_result_folder_name,
     post_benchmark_launch_message,
 )
 from sdgym.s3 import get_s3_client, parse_s3_path
 
 
-def append_benchmark_run(
-    aws_access_key_id, aws_secret_access_key, date_str, modality='single_table'
-):
+def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str):
     """Append a new benchmark run to the benchmark dates file in S3."""
     s3_client = get_s3_client(
         aws_access_key_id=aws_access_key_id,
         aws_secret_access_key=aws_secret_access_key,
     )
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     try:
-        object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{modality}{KEY_DATE_FILE}')
+        object = s3_client.get_object(Bucket=bucket, Key=f'{prefix}{KEY_DATE_FILE}')
         body = object['Body'].read().decode('utf-8')
         data = json.loads(body)
     except ClientError as e:
@@ -50,116 +41,23 @@ def append_benchmark_run(
     )
 
 
-def _load_gcp_service_account_from_env():
-    """Load GCP service account JSON from env.
-
-    Supports:
-      - raw JSON string
-      - base64-encoded JSON string
-    """
-    raw = os.getenv('GCP_SERVICE_ACCOUNT_JSON', '') or ''
-    if not raw.strip():
-        return {}
-
-    try:
-        return json.loads(raw)
-    except json.JSONDecodeError:
-        decoded = base64.b64decode(raw).decode('utf-8')
-        return json.loads(decoded)
-
-
-def create_credentials_file(filepath):
-    """Create credentials file used by the benchmark launcher."""
-    gcp_sa = _load_gcp_service_account_from_env()
-
-    credentials = {
-        'aws': {
-            'aws_access_key_id': os.getenv('AWS_ACCESS_KEY_ID'),
-            'aws_secret_access_key': os.getenv('AWS_SECRET_ACCESS_KEY'),
-        },
-        'gcp': {
-            **gcp_sa,
-            'gcp_project': GCP_PROJECT,
-            'gcp_zone': GCP_ZONE,
-        },
-        'sdv': {
-            'username': os.getenv('SDV_ENTERPRISE_USERNAME'),
-            'license_key': os.getenv('SDV_ENTERPRISE_LICENSE_KEY'),
-        },
-    }
-
-    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
-    with open(filepath, 'w', encoding='utf-8') as f:
-        json.dump(credentials, f, indent=2, sort_keys=True)
-        f.write('\n')
-
-
-def _parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--modality',
-        choices=['single_table', 'multi_table'],
-        default='single_table',
-        help='Benchmark modality to run.',
-    )
-    parser.add_argument(
-        '--gcp-output-destination',
-        default='s3://sdgym-benchmark/Debug/GCP/',
-        help='Where to store GCP benchmark results (S3).',
-    )
-    return parser.parse_args()
-
-
 def main():
     """Main function to run the benchmark and upload results."""
-    args = _parse_args()
-
     aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
     aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
     date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
-
-    if args.modality == 'single_table':
-        for synthesizer_group in SYNTHESIZERS_SPLIT_SINGLE_TABLE:
-            benchmark_single_table_aws(
-                output_destination=OUTPUT_DESTINATION_AWS,
-                aws_access_key_id=aws_access_key_id,
-                aws_secret_access_key=aws_secret_access_key,
-                synthesizers=synthesizer_group,
-                compute_privacy_score=False,
-                timeout=345600,  # 4 days
-            )
-
-        append_benchmark_run(
-            aws_access_key_id,
-            aws_secret_access_key,
-            date_str,
-            modality='single_table',
-        )
-        compute_service = 'AWS'
-
-    else:
-        runner_temp = os.environ.get('RUNNER_TEMP', '/tmp')
-        cred_path = os.path.join(runner_temp, 'credentials.json')
-        create_credentials_file(cred_path)
-
-        for synthesizer_group in SYNTHESIZERS_SPLIT_MULTI_TABLE:
-            _benchmark_multi_table_compute_gcp(
-                output_destination=args.gcp_output_destination,
-                credential_filepath=cred_path,
-                synthesizers=synthesizer_group,
-                compute_privacy_score=False,
-                timeout=345600,  # 4 days
-            )
-
-        append_benchmark_run(
-            aws_access_key_id,
-            aws_secret_access_key,
-            date_str,
-            modality='multi_table',
+    for synthesizer_group in SYNTHESIZERS_SPLIT:
+        benchmark_single_table_aws(
+            output_destination=OUTPUT_DESTINATION_AWS,
+            aws_access_key_id=aws_access_key_id,
+            aws_secret_access_key=aws_secret_access_key,
+            synthesizers=synthesizer_group,
+            compute_privacy_score=False,
+            timeout=345600,  # 4 days
         )
-        compute_service = 'GCP'
 
-    post_benchmark_launch_message(date_str, compute_service=compute_service)
+    append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str)
+    post_benchmark_launch_message(date_str)
 
 
 if __name__ == '__main__':
diff --git a/sdgym/run_benchmark/utils.py b/sdgym/run_benchmark/utils.py
@@ -9,8 +9,6 @@
 
 from sdgym.s3 import parse_s3_path
 
-GCP_ZONE = 'us-central1-a'
-GCP_PROJECT = 'sdgym-337614'
 OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
 UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
 DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug'
@@ -50,16 +48,12 @@
 ]
 
 # The synthesizers inside the same list will be run by the same ec2 instance
-SYNTHESIZERS_SPLIT_SINGLE_TABLE = [
+SYNTHESIZERS_SPLIT = [
     ['UniformSynthesizer', 'ColumnSynthesizer', 'GaussianCopulaSynthesizer', 'TVAESynthesizer'],
     ['CopulaGANSynthesizer'],
     ['CTGANSynthesizer'],
     ['RealTabFormerSynthesizer'],
 ]
-SYNTHESIZERS_SPLIT_MULTI_TABLE = [
-    ['HMASynthesizer'],
-    ['HSASynthesizer', 'IndependentSynthesizer', 'MultiTableUniformSynthesizer'],
-]
 
 
 def get_result_folder_name(date_str):
@@ -97,13 +91,13 @@ def post_slack_message(channel, text):
     client.chat_postMessage(channel=channel, text=text)
 
 
-def post_benchmark_launch_message(date_str, compute_service='AWS'):
+def post_benchmark_launch_message(date_str):
     """Post a message to the SDV Alerts Slack channel when the benchmark is launched."""
     channel = SLACK_CHANNEL
     folder_name = get_result_folder_name(date_str)
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}{folder_name}/')
-    body = f'🏃 SDGym benchmark has been launched on {compute_service}! '
+    body = '🏃 SDGym benchmark has been launched! EC2 Instances are running. '
     body += f'Intermediate results can be found <{url_link}|here>.\n'
     post_slack_message(channel, body)
 
diff --git a/tasks.py b/tasks.py
@@ -203,9 +203,9 @@ def rmdir(c, path):
         pass
 
 @task
-def run_sdgym_benchmark(c, modality='single_table'):
+def run_sdgym_benchmark(c):
     """Run the SDGym benchmark."""
-    c.run(f'python sdgym/run_benchmark/run_benchmark.py --modality {modality}')
+    c.run('python sdgym/run_benchmark/run_benchmark.py')
 
 @task
 def upload_benchmark_results(c):
diff --git a/tests/unit/run_benchmark/test_run_benchmark.py b/tests/unit/run_benchmark/test_run_benchmark.py
@@ -5,7 +5,7 @@
 from botocore.exceptions import ClientError
 
 from sdgym.run_benchmark.run_benchmark import append_benchmark_run, main
-from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, SYNTHESIZERS_SPLIT_SINGLE_TABLE
+from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, SYNTHESIZERS_SPLIT
 
 
 @patch('sdgym.run_benchmark.run_benchmark.get_s3_client')
@@ -122,7 +122,7 @@ def test_main(
     mock_getenv.assert_any_call('AWS_ACCESS_KEY_ID')
     mock_getenv.assert_any_call('AWS_SECRET_ACCESS_KEY')
     expected_calls = []
-    for synthesizer in SYNTHESIZERS_SPLIT_SINGLE_TABLE:
+    for synthesizer in SYNTHESIZERS_SPLIT:
         expected_calls.append(
             call(
                 output_destination=OUTPUT_DESTINATION_AWS,