
Commit 223d2b2

update upload results

1 parent d718fc7 commit 223d2b2

File tree

12 files changed (+251, -65 lines)

.github/workflows/upload_benchmark_results.yml

Lines changed: 1 addition & 1 deletion
@@ -88,4 +88,4 @@ jobs:
       run: |
         invoke notify-sdgym-benchmark-uploaded \
           --folder-name "$FOLDER_NAME" \
-          --commit-url "$COMMIT_URL"
+          --commit-url "$COMMIT_URL" --modality single_table
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
name: Upload SDGym Multi-Table Benchmark results

on:
  workflow_run:
    workflows: ["Run SDGym Benchmark Multi-Table"]
    types:
      - completed
  workflow_dispatch:
  schedule:
    - cron: '0 6 * * *'
  push:
    branches:
      - issue-516-add-workflows

jobs:
  upload-sdgym-benchmark:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up latest Python
        uses: actions/setup-python@v5
        with:
          python-version-file: 'pyproject.toml'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install --no-cache-dir -e .[dev]

      - name: Upload SDGym Benchmark
        env:
          PYDRIVE_TOKEN: ${{ secrets.PYDRIVE_TOKEN }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          GITHUB_LOCAL_RESULTS_DIR: ${{ runner.temp }}/sdgym-leaderboard-files
        run: |
          invoke upload-benchmark-results --modality multi_table
          echo "GITHUB_LOCAL_RESULTS_DIR=$GITHUB_LOCAL_RESULTS_DIR" >> $GITHUB_ENV

      - name: Prepare files for commit
        if: env.SKIP_UPLOAD != 'true'
        run: |
          mkdir pr-staging
          echo "Looking for files in: $GITHUB_LOCAL_RESULTS_DIR"
          ls -l "$GITHUB_LOCAL_RESULTS_DIR"
          for f in "$GITHUB_LOCAL_RESULTS_DIR"/*; do
            if [ -f "$f" ]; then
              base=$(basename "$f")
              cp "$f" "pr-staging/${base}"
            fi
          done

          echo "Files staged for PR:"
          ls -l pr-staging

      - name: Checkout target repo (sdv-dev.github.io)
        if: env.SKIP_UPLOAD != 'true'
        run: |
          git clone https://github.com/sdv-dev/sdv-dev.github.io.git target-repo
          cd target-repo
          git checkout gatsby-home

      - name: Copy results and commit
        if: env.SKIP_UPLOAD != 'true'
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
          FOLDER_NAME: ${{ env.FOLDER_NAME }}
        run: |
          cp pr-staging/* target-repo/assets/sdgym-leaderboard-files/
          cd target-repo
          git checkout gatsby-home
          git config --local user.name "github-actions[bot]"
          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add assets/
          git commit -m "Upload SDGym Benchmark Results ($FOLDER_NAME)" || echo "No changes to commit"
          git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/sdv-dev/sdv-dev.github.io.git
          git push origin gatsby-home
          COMMIT_HASH=$(git rev-parse HEAD)
          COMMIT_URL="https://github.com/sdv-dev/sdv-dev.github.io/commit/${COMMIT_HASH}"
          echo "Commit URL: $COMMIT_URL"
          echo "COMMIT_URL=$COMMIT_URL" >> $GITHUB_ENV

      - name: Send Slack notification
        if: env.SKIP_UPLOAD != 'true'
        env:
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
        run: |
          invoke notify-sdgym-benchmark-uploaded \
            --folder-name "$FOLDER_NAME" \
            --commit-url "$COMMIT_URL" --modality multi_table

sdgym/_benchmark/credentials_utils.py

Lines changed: 6 additions & 4 deletions
@@ -21,6 +21,8 @@
     },
     'sdv': {'username', 'license_key'},
 }
+GCP_PROJECT_ID = 'sdgym-337614'
+GCP_ZONE = 'us-central1-a'


 def get_credentials(credential_filepath):
@@ -69,8 +71,7 @@ def sdv_install_cmd(credentials):
         return ''

     return (
-        'pip install bundle-xsynthesizers '
-        f'--index-url https://{username}:{license_key}@pypi.datacebo.com'
+        f'pip install sdv_enterprise --index-url https://{username}:{license_key}@pypi.datacebo.com'
     )


@@ -84,8 +85,8 @@ def create_credentials_file():
         },
         'gcp': {
             **json.loads(gcp_json),
-            'gcp_project': 'sdgym-337614',
-            'gcp_zone': 'us-central1-a',
+            'gcp_project': GCP_PROJECT_ID,
+            'gcp_zone': GCP_ZONE,
         },
         'sdv': {
             'username': os.getenv('SDV_ENTERPRISE_USERNAME'),
@@ -96,4 +97,5 @@ def create_credentials_file():
     tmp_file = NamedTemporaryFile(mode='w+', delete=False, suffix='.json')
     json.dump(credentials, tmp_file)
     tmp_file.flush()
+
     return tmp_file.name
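Taken together, these hunks pin the GCP project and zone as module-level constants and switch the enterprise install from bundle-xsynthesizers to sdv_enterprise. A minimal sketch of the resulting helper, reconstructed from the hunks above (the credential lookups outside the visible context are assumed):

    GCP_PROJECT_ID = 'sdgym-337614'
    GCP_ZONE = 'us-central1-a'


    def sdv_install_cmd(credentials):
        # Assumed lookup: the required-keys mapping at the top of the module
        # declares credentials['sdv'] with 'username' and 'license_key'.
        sdv = credentials.get('sdv', {})
        username = sdv.get('username')
        license_key = sdv.get('license_key')
        if not username or not license_key:
            return ''

        return f'pip install sdv_enterprise --index-url https://{username}:{license_key}@pypi.datacebo.com'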

sdgym/result_explorer/result_explorer.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ def _validate_local_path(path):

 _BASELINE_BY_MODALITY = {
     'single_table': SYNTHESIZER_BASELINE,
-    'multi_table': 'MultiTableUniformSynthesizer',
+    'multi_table': 'IdependentSynthesizer',
 }

sdgym/run_benchmark/run_benchmark.py

Lines changed: 1 addition & 12 deletions
@@ -1,6 +1,5 @@
 """Script to run a benchmark and upload results to S3."""

-import argparse
 import json
 import os
 from datetime import datetime, timezone
@@ -16,6 +15,7 @@
     OUTPUT_DESTINATION_AWS,
     SYNTHESIZERS_SPLIT_MULTI_TABLE,
     SYNTHESIZERS_SPLIT_SINGLE_TABLE,
+    _parse_args,
     get_result_folder_name,
     post_benchmark_launch_message,
 )
@@ -61,17 +61,6 @@ def append_benchmark_run(
     )


-def _parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--modality',
-        choices=['single_table', 'multi_table'],
-        default='single_table',
-        help='Benchmark modality to run.',
-    )
-    return parser.parse_args()
-
-
 def main():
     """Main function to run the benchmark and upload results."""
     args = _parse_args()

sdgym/run_benchmark/upload_benchmark_results.py

Lines changed: 31 additions & 13 deletions
@@ -16,7 +16,7 @@

 from sdgym.result_explorer.result_explorer import ResultsExplorer
 from sdgym.result_writer import LocalResultsWriter
-from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, get_df_to_plot
+from sdgym.run_benchmark.utils import OUTPUT_DESTINATION_AWS, _parse_args, get_df_to_plot
 from sdgym.s3 import S3_REGION, parse_s3_path

 LOGGER = logging.getLogger(__name__)
@@ -45,17 +45,21 @@ def get_latest_run_from_file(s3_client, bucket, key):
         raise RuntimeError(f'Failed to read {key} from S3: {e}')


-def write_uploaded_marker(s3_client, bucket, prefix, folder_name):
+def write_uploaded_marker(s3_client, bucket, prefix, folder_name, modality='single_table'):
     """Write a marker file to indicate that the upload is complete."""
     s3_client.put_object(
-        Bucket=bucket, Key=f'{prefix}{folder_name}/upload_complete.marker', Body=b'Upload complete'
+        Bucket=bucket,
+        Key=f'{prefix}{modality}/{folder_name}/upload_complete.marker',
+        Body=b'Upload complete',
     )


-def upload_already_done(s3_client, bucket, prefix, folder_name):
+def upload_already_done(s3_client, bucket, prefix, folder_name, modality='single_table'):
     """Check if the upload has already been done by looking for the marker file."""
     try:
-        s3_client.head_object(Bucket=bucket, Key=f'{prefix}{folder_name}/upload_complete.marker')
+        s3_client.head_object(
+            Bucket=bucket, Key=f'{prefix}{modality}/{folder_name}/upload_complete.marker'
+        )
         return True
     except ClientError as e:
         if e.response['Error']['Code'] == '404':
@@ -64,7 +68,9 @@ def upload_already_done(s3_client, bucket, prefix, folder_name):
         raise


-def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key):
+def get_result_folder_name_and_s3_vars(
+    aws_access_key_id, aws_secret_access_key, modality='single_table'
+):
     """Get the result folder name and S3 client variables."""
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     s3_client = boto3.client(
@@ -73,7 +79,9 @@ def get_result_folder_name_and_s3_vars(aws_access_key_id, aws_secret_access_key)
         aws_secret_access_key=aws_secret_access_key,
         region_name=S3_REGION,
     )
-    folder_infos = get_latest_run_from_file(s3_client, bucket, f'{prefix}_BENCHMARK_DATES.json')
+    folder_infos = get_latest_run_from_file(
+        s3_client, bucket, f'{prefix}{modality}/_BENCHMARK_DATES.json'
+    )

     return folder_infos, s3_client, bucket, prefix

@@ -109,14 +117,21 @@ def upload_to_drive(file_path, file_id):


 def upload_results(
-    aws_access_key_id, aws_secret_access_key, folder_infos, s3_client, bucket, prefix, github_env
+    aws_access_key_id,
+    aws_secret_access_key,
+    folder_infos,
+    s3_client,
+    bucket,
+    prefix,
+    github_env,
+    modality='single_table',
 ):
     """Upload benchmark results to S3, GDrive, and save locally."""
     folder_name = folder_infos['folder_name']
     run_date = folder_infos['date']
     result_explorer = ResultsExplorer(
         OUTPUT_DESTINATION_AWS,
-        modality='single_table',
+        modality=modality,
         aws_access_key_id=aws_access_key_id,
         aws_secret_access_key=aws_secret_access_key,
     )
@@ -145,7 +160,7 @@ def upload_results(

     Path(local_export_dir).mkdir(parents=True, exist_ok=True)
     local_file_path = str(Path(local_export_dir) / RESULT_FILENAME)
-    s3_key = f'{prefix}{RESULT_FILENAME}'
+    s3_key = f'{prefix}{modality}/{RESULT_FILENAME}'
     s3_client.download_file(bucket, s3_key, local_file_path)
     datas = {
         'Wins': summary,
@@ -155,20 +170,22 @@ def upload_results(
     local_results_writer.write_xlsx(datas, local_file_path)
     upload_to_drive((local_file_path), SDGYM_FILE_ID)
     s3_client.upload_file(local_file_path, bucket, s3_key)
-    write_uploaded_marker(s3_client, bucket, prefix, folder_name)
+    write_uploaded_marker(s3_client, bucket, prefix, folder_name, modality=modality)
     if temp_dir:
         shutil.rmtree(temp_dir)


 def main():
     """Main function to upload benchmark results."""
+    args = _parse_args()
+    modality = args.modality
     aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
     aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
     folder_infos, s3_client, bucket, prefix = get_result_folder_name_and_s3_vars(
-        aws_access_key_id, aws_secret_access_key
+        aws_access_key_id, aws_secret_access_key, modality=modality
     )
     github_env = os.getenv('GITHUB_ENV')
-    if upload_already_done(s3_client, bucket, prefix, folder_infos['folder_name']):
+    if upload_already_done(s3_client, bucket, prefix, folder_infos['folder_name'], modality):
         LOGGER.warning('Benchmark results have already been uploaded. Exiting.')
         if github_env:
             with open(github_env, 'a') as env_file:
@@ -184,6 +201,7 @@ def main():
         bucket,
         prefix,
         github_env,
+        modality,
     )

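The net effect of these hunks is that every S3 object the script reads or writes is now namespaced by modality. A sketch of the resulting key layout, using placeholder values (the real prefix comes from OUTPUT_DESTINATION_AWS and the folder name from _BENCHMARK_DATES.json; the RESULT_FILENAME value is assumed):

    # Placeholder values for illustration only.
    prefix = 'Debug/GCP_Github/'    # parsed from OUTPUT_DESTINATION_AWS
    modality = 'multi_table'
    folder_name = 'example_folder'  # read from _BENCHMARK_DATES.json

    dates_key = f'{prefix}{modality}/_BENCHMARK_DATES.json'
    marker_key = f'{prefix}{modality}/{folder_name}/upload_complete.marker'
    results_key = f'{prefix}{modality}/SDGym Monthly Run.xlsx'  # assumed RESULT_FILENAME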
sdgym/run_benchmark/utils.py

Lines changed: 23 additions & 5 deletions
@@ -1,5 +1,6 @@
 """Utils file for the run_benchmark module."""

+import argparse
 import os
 from datetime import datetime
 from urllib.parse import quote_plus
@@ -12,7 +13,7 @@
 OUTPUT_DESTINATION_AWS = (
     's3://sdgym-benchmark/Debug/GCP_Github/'  # 's3://sdgym-benchmark/Benchmarks/'
 )
-UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Benchmarks/'
+UPLOAD_DESTINATION_AWS = 's3://sdgym-benchmark/Debug/GCP/'
 DEBUG_SLACK_CHANNEL = 'sdv-alerts-debug'
 SLACK_CHANNEL = 'sdv-alerts'
 KEY_DATE_FILE = '_BENCHMARK_DATES.json'
@@ -48,6 +49,10 @@
     'diamond-cross',
     'diamond-x',
 ]
+MODALITY_TO_GDRIVE_LINK = {
+    'single_table': 'https://docs.google.com/spreadsheets/d/1W3tsGOOtbtTw3g0EVE0irLgY_TN_cy2W4ONiZQ57OPo/edit?usp=sharing',
+    'multi_table': 'https://docs.google.com/spreadsheets/d/1R13RktVvKnxRecYIge07OBpbX1vbEkE2D1_2idNAKSY/edit?usp=sharing',
+}

 # The synthesizers inside the same list will be run by the same ec2 instance
 SYNTHESIZERS_SPLIT_SINGLE_TABLE = [
@@ -103,20 +108,22 @@ def post_benchmark_launch_message(date_str, compute_service='AWS', modality='sin
     folder_name = get_result_folder_name(date_str)
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
     url_link = get_s3_console_link(bucket, f'{prefix}{modality}/{folder_name}/')
-    body = f'🏃 SDGym benchmark has been launched on {compute_service}! '
+    modality_text = modality.replace('_', '-')
+    body = f'🏃 SDGym {modality_text} benchmark has been launched on {compute_service}!\n'
     body += f'Intermediate results can be found <{url_link}|here>.\n'
     post_slack_message(channel, body)


 def post_benchmark_uploaded_message(folder_name, commit_url=None, modality='single_table'):
     """Post benchmark uploaded message to sdv-alerts slack channel."""
-    channel = SLACK_CHANNEL
+    channel = DEBUG_SLACK_CHANNEL
     bucket, prefix = parse_s3_path(OUTPUT_DESTINATION_AWS)
+    modality_text = modality.replace('_', '-')
     url_link = get_s3_console_link(bucket, quote_plus(f'{prefix}{modality}/SDGym Monthly Run.xlsx'))
     body = (
-        f'🤸🏻‍♀️ SDGym benchmark results for *{folder_name}* are available! 🏋️‍♀️\n'
+        f'🤸🏻‍♀️ SDGym {modality_text} benchmark results for *{folder_name}* are available! 🏋️‍♀️\n'
         f'Check the results:\n'
-        f' - On GDrive: <{GDRIVE_LINK}|link>\n'
+        f' - On GDrive: <{MODALITY_TO_GDRIVE_LINK[modality]}|link>\n'
         f' - On S3: <{url_link}|link>\n'
     )
     if commit_url:
@@ -168,3 +175,14 @@ def get_df_to_plot(benchmark_result):
     df_to_plot = df_to_plot.rename(columns={'Adjusted_Quality_Score': 'Quality_Score'})

     return df_to_plot.drop(columns=['Cumulative Quality Score']).reset_index(drop=True)
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--modality',
+        choices=['single_table', 'multi_table'],
+        default='single_table',
+        help='Benchmark modality to run.',
+    )
+    return parser.parse_args()
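Because run_benchmark.py and upload_benchmark_results.py now import this shared parser, both entry points accept the same flag. A quick self-contained check of the default behavior (sys.argv is stubbed here purely for illustration):

    import sys

    from sdgym.run_benchmark.utils import _parse_args

    # Simulate: python upload_benchmark_results.py --modality multi_table
    sys.argv = ['upload_benchmark_results.py', '--modality', 'multi_table']
    assert _parse_args().modality == 'multi_table'

    # Without the flag, the parser falls back to the single-table default.
    sys.argv = ['upload_benchmark_results.py']
    assert _parse_args().modality == 'single_table'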

tasks.py

Lines changed: 4 additions & 4 deletions
@@ -208,13 +208,13 @@ def run_sdgym_benchmark(c, modality='single_table'):
     c.run(f'python sdgym/run_benchmark/run_benchmark.py --modality {modality}')

 @task
-def upload_benchmark_results(c):
+def upload_benchmark_results(c, modality='single_table'):
     """Upload the benchmark results to S3."""
-    c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py')
+    c.run(f'python sdgym/run_benchmark/upload_benchmark_results.py --modality {modality}')

 @task
-def notify_sdgym_benchmark_uploaded(c, folder_name, commit_url=None):
+def notify_sdgym_benchmark_uploaded(c, folder_name, commit_url=None, modality='single_table'):
     """Notify Slack about the SDGym benchmark upload."""
     from sdgym.run_benchmark.utils import post_benchmark_uploaded_message

-    post_benchmark_uploaded_message(folder_name, commit_url)
+    post_benchmark_uploaded_message(folder_name, commit_url, modality)
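With modality threaded through all three tasks, the notification path can also be exercised directly from Python; a sketch with hypothetical values (in CI, the folder name and commit URL come from $FOLDER_NAME and $COMMIT_URL):

    from sdgym.run_benchmark.utils import post_benchmark_uploaded_message

    # Hypothetical arguments; the call matches the signature shown in the
    # utils.py diff above.
    post_benchmark_uploaded_message(
        'example_folder',
        commit_url='https://github.com/sdv-dev/sdv-dev.github.io/commit/abc123',
        modality='multi_table',
    )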

0 commit comments
