Skip to content

Commit 62a1951

Browse files
committed
address comments
1 parent 9bab838 commit 62a1951

File tree

11 files changed

+231
-191
lines changed

11 files changed

+231
-191
lines changed

.github/workflows/run_benchmark.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ jobs:
2222
- name: Install dependencies
2323
run: |
2424
python -m pip install --upgrade pip
25-
python -m pip install invoke
2625
python -m pip install -e .[dev]
2726
2827
- name: Run SDGym Benchmark

sdgym/_run_benchmark/__init__.py

Lines changed: 0 additions & 9 deletions
This file was deleted.

sdgym/_run_benchmark/_utils.py

Lines changed: 0 additions & 10 deletions
This file was deleted.

sdgym/_run_benchmark/upload_benchmark_results.py

Lines changed: 0 additions & 92 deletions
This file was deleted.
Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,53 @@
1+
"""Script to run a benchmark and upload results to S3."""
2+
13
import json
24
import os
35
from datetime import datetime, timezone
46

57
from botocore.exceptions import ClientError
68

7-
import sdgym._run_benchmark as run_benchmark
8-
from sdgym._run_benchmark._utils import get_run_name
99
from sdgym.benchmark import benchmark_single_table_aws
10+
from sdgym.run_benchmark.utils import (
11+
KEY_DATE_FILE,
12+
OUTPUT_DESTINATION_AWS,
13+
SYNTHESIZERS,
14+
get_result_folder_name,
15+
)
1016
from sdgym.s3 import get_s3_client, parse_s3_path
1117

1218

1319
def append_benchmark_run(aws_access_key_id, aws_secret_access_key, date_str):
    """Append a new benchmark run to the benchmark dates file in S3.

    The dates file (``KEY_DATE_FILE`` under ``OUTPUT_DESTINATION_AWS``) is read,
    a new entry for ``date_str`` is appended, the entries are re-sorted by date
    and the file is written back. If the file does not exist yet, it is created.

    Args:
        aws_access_key_id:
            AWS access key ID forwarded to ``get_s3_client``.
        aws_secret_access_key:
            AWS secret access key forwarded to ``get_s3_client``.
        date_str (str):
            Date of the run. It is stored under ``'date'`` and passed to
            ``get_result_folder_name`` to build the ``'folder_name'`` entry.

    Raises:
        RuntimeError:
            If the dates file cannot be read for any reason other than the
            key not existing.
    """
    s3_client = get_s3_client(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
    dates_key = f'{prefix}{KEY_DATE_FILE}'
    try:
        response = s3_client.get_object(Bucket=bucket, Key=dates_key)
        body = response['Body'].read().decode('utf-8')
        data = json.loads(body)
    except ClientError as e:
        # A missing file just means this is the first recorded run.
        if e.response['Error']['Code'] == 'NoSuchKey':
            data = {'runs': []}
        else:
            # Chain the original error so the S3 traceback is preserved.
            raise RuntimeError(f'Failed to read {KEY_DATE_FILE} from S3: {e}') from e

    data['runs'].append({'date': date_str, 'folder_name': get_result_folder_name(date_str)})
    data['runs'] = sorted(data['runs'], key=lambda x: x['date'])
    s3_client.put_object(Bucket=bucket, Key=dates_key, Body=json.dumps(data).encode('utf-8'))
3341

3442

3543
def main():
44+
"""Main function to run the benchmark and upload results."""
3645
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
3746
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
3847
date_str = datetime.now(timezone.utc).strftime('%Y-%m-%d')
39-
for synthesizer in run_benchmark.SYNTHESIZERS:
48+
for synthesizer in SYNTHESIZERS:
4049
benchmark_single_table_aws(
41-
output_destination=run_benchmark.OUTPUT_DESTINATION_AWS,
50+
output_destination=OUTPUT_DESTINATION_AWS,
4251
aws_access_key_id=aws_access_key_id,
4352
aws_secret_access_key=aws_secret_access_key,
4453
synthesizers=[synthesizer],
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Script to upload benchmark results to S3."""
2+
3+
import json
4+
import logging
5+
import os
6+
import sys
7+
8+
import boto3
9+
from botocore.exceptions import ClientError
10+
11+
from sdgym.result_writer import S3ResultsWriter
12+
from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS
13+
from sdgym.s3 import S3_REGION, parse_s3_path
14+
from sdgym.sdgym_result_explorer.result_explorer import SDGymResultsExplorer
15+
16+
LOGGER = logging.getLogger(__name__)
17+
18+
19+
def get_latest_run_from_file(s3_client, bucket, key):
    """Get the latest run folder name from the benchmark dates file in S3.

    Args:
        s3_client:
            S3 client exposing ``get_object`` and ``exceptions.ClientError``.
        bucket (str):
            Name of the bucket holding the dates file.
        key (str):
            Key of the dates file inside the bucket.

    Returns:
        str:
            The ``'folder_name'`` of the run with the greatest ``'date'`` value.

    Raises:
        RuntimeError:
            If the file cannot be read from S3, or if it lists no runs.
    """
    try:
        response = s3_client.get_object(Bucket=bucket, Key=key)
        body = response['Body'].read().decode('utf-8')
    except s3_client.exceptions.ClientError as e:
        # Chain the original error so the S3 traceback is preserved.
        raise RuntimeError(f'Failed to read {key} from S3: {e}') from e

    runs = json.loads(body).get('runs')
    if not runs:
        # Fail with a clear message instead of an IndexError on an empty list.
        raise RuntimeError(f'No benchmark runs found in {key}.')

    latest = sorted(runs, key=lambda run: run['date'])[-1]
    return latest['folder_name']
29+
30+
31+
def write_uploaded_marker(s3_client, bucket, prefix, folder_name):
    """Store the ``upload_complete.marker`` object for a run folder.

    The marker is written under ``{prefix}{folder_name}/`` in ``bucket``.
    """
    marker_key = f'{prefix}{folder_name}/upload_complete.marker'
    s3_client.put_object(Bucket=bucket, Key=marker_key, Body=b'Upload complete')
36+
37+
38+
def upload_already_done(s3_client, bucket, prefix, folder_name):
    """Tell whether the ``upload_complete.marker`` object exists for a run folder.

    Returns ``True`` when ``head_object`` succeeds for the marker key and
    ``False`` when it fails with error code ``'404'``. Any other
    ``ClientError`` is re-raised unchanged.
    """
    marker_key = f'{prefix}{folder_name}/upload_complete.marker'
    try:
        s3_client.head_object(Bucket=bucket, Key=marker_key)
    except ClientError as error:
        if error.response['Error']['Code'] != '404':
            raise

        return False

    return True
48+
49+
50+
def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key):
    """Get the result folder name and S3 client variables.

    Args:
        aws_access_key_id:
            AWS access key ID used to build the S3 client.
        aws_secret_access_key:
            AWS secret access key used to build the S3 client.

    Returns:
        tuple:
            ``(folder_name, s3_client, bucket, prefix)`` where ``bucket`` and
            ``prefix`` come from parsing ``OUTPUT_DESTINATION_AWS`` and
            ``folder_name`` is the latest run listed in the benchmark dates file.
    """
    # Imported here so the dates file name stays in sync with the script that
    # writes it, instead of duplicating the literal in this module.
    from sdgym.run_benchmark.utils import KEY_DATE_FILE

    bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
    s3_client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=S3_REGION,
    )
    folder_name = get_latest_run_from_file(s3_client, bucket, f'{prefix}{KEY_DATE_FILE}')

    return folder_name, s3_client, bucket, prefix
62+
63+
64+
def upload_results(
    aws_access_key_id, aws_secret_access_key, folder_name, s3_client, bucket, prefix
):
    """Summarize a finished benchmark run and store the summary in S3.

    If the explorer reports that the run in ``folder_name`` is not complete,
    a warning is logged and the process exits with status 0. Otherwise the
    summary is written as ``{folder_name}_summary.csv`` inside the run folder
    and the upload marker is written afterwards.
    """
    explorer = SDGymResultsExplorer(
        OUTPUT_DESTINATION_AWS,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    writer = S3ResultsWriter(s3_client)

    run_is_complete = explorer.all_runs_complete(folder_name)
    if not run_is_complete:
        LOGGER.warning(f'Run {folder_name} is not complete yet. Exiting.')
        sys.exit(0)

    LOGGER.info(f'Run {folder_name} is complete! Proceeding with summarization...')
    summary, _unused = explorer.summarize(folder_name)
    summary_path = f'{OUTPUT_DESTINATION_AWS}{folder_name}/{folder_name}_summary.csv'
    writer.write_dataframe(summary, summary_path, index=True)
    write_uploaded_marker(s3_client, bucket, prefix, folder_name)
85+
86+
87+
def main():
    """Entry point: upload the latest benchmark results unless already uploaded."""
    access_key = os.getenv('AWS_ACCESS_KEY_ID')
    secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    s3_vars = get_result_folder_name_and_s3_vars(access_key, secret_key)
    folder_name, s3_client, bucket, prefix = s3_vars

    # The marker file is the signal that a previous invocation already finished.
    already_uploaded = upload_already_done(s3_client, bucket, prefix, folder_name)
    if already_uploaded:
        LOGGER.warning('Benchmark results have already been uploaded. Exiting.')
        sys.exit(0)

    upload_results(access_key, secret_key, folder_name, s3_client, bucket, prefix)
99+
100+
101+
if __name__ == '__main__':
102+
main()

sdgym/run_benchmark/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Utils file for the run_benchmark module."""
2+
3+
from datetime import datetime
4+
5+
from sdgym.benchmark import SDV_SINGLE_TABLE_SYNTHESIZERS
6+
7+
# S3 path passed as the benchmark output destination and parsed for the bucket/prefix.
# NOTE(review): both destinations below point at a Debug prefix; confirm before release.
OUTPUT_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/'
8+
UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/Issue_425/'
9+
# Slack channel names; presumably used for benchmark notifications (TODO confirm).
DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug'
10+
SLACK_CHANNEL = 'sdv-alerts'
11+
# Name of the JSON file, stored under the output prefix, that lists the benchmark runs.
KEY_DATE_FILE = '_BENCHMARK_DATES.json'
12+
# Synthesizers that the benchmark script iterates over.
SYNTHESIZERS = SDV_SINGLE_TABLE_SYNTHESIZERS
13+
14+
15+
def get_result_folder_name(date_str):
    """Get the result folder name based on the date string.

    Args:
        date_str (str):
            Date in ``YYYY-MM-DD`` format.

    Returns:
        str:
            Folder name of the form ``SDGym_results_MM_DD_YYYY``.

    Raises:
        ValueError:
            If ``date_str`` does not match the ``YYYY-MM-DD`` format.
    """
    try:
        date = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError as error:
        # Chain explicitly so the parsing failure is shown as the direct cause.
        raise ValueError(f'Invalid date format: {date_str}. Expected YYYY-MM-DD.') from error

    return f'SDGym_results_{date.month:02d}_{date.day:02d}_{date.year}'

tests/unit/_run_benchmark/test__utils.py

Lines changed: 0 additions & 14 deletions
This file was deleted.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import pytest
2+
3+
from sdgym.run_benchmark.utils import get_result_folder_name
4+
5+
6+
def test_get_result_folder_name():
    """Check `get_result_folder_name` for a valid date and for an invalid one."""
    # Setup
    valid_date = '2023-10-01'
    invalid_date = 'invalid-date'
    expected_error_message = 'Invalid date format: invalid-date. Expected YYYY-MM-DD.'

    # Run
    folder_name = get_result_folder_name(valid_date)

    # Assert
    assert folder_name == 'SDGym_results_10_01_2023'
    with pytest.raises(ValueError, match=expected_error_message):
        get_result_folder_name(invalid_date)

0 commit comments

Comments
 (0)