Skip to content

Commit ed8e5ed

Browse files
authored
Merge pull request #112 from duckdblabs/add_trigger_to_run_the_benchmark
Add action to run the benchmark
2 parents cecf694 + 26eeabe commit ed8e5ed

File tree

22 files changed

+339
-51
lines changed

22 files changed

+339
-51
lines changed

.github/workflows/RunBenchmark.yml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
name: Run benchmark
2+
on:
3+
workflow_dispatch:
4+
5+
concurrency:
6+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
7+
cancel-in-progress: true
8+
9+
env:
10+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
11+
gh_issue_repo: duckdblabs/db-benchmark
12+
instance_id: i-0e75a4f0e0b61c475
13+
14+
jobs:
start-aws-machine:
15+
name: Start aws-small-machine
16+
runs-on: ubuntu-latest
17+
18+
steps:
19+
- name: Start EC2 runner
20+
shell: bash
21+
env:
22+
AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}}
23+
AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}}
24+
AWS_DEFAULT_REGION: us-east-1
25+
run: aws ec2 start-instances --instance-id ${{ env.instance_id }}
26+
27+
- name: Create issue if failure
28+
shell: bash
29+
if: ${{ failure() && contains(github.ref_name, 'main') }}
30+
run: |
31+
gh issue create --repo ${{ env.gh_issue_repo }} --title "Weekly Regression Test Failure" --body "AWS box with instance-id ${{ env.instance_id }} could not be started"
32+
33+
34+
run-benchmark:
35+
name: Regression Tests all solutions
36+
env:
37+
CC: gcc-10
38+
CXX: g++-10
39+
GEN: ninja
40+
runs-on: self-hosted
41+
permissions: # Job-level permissions configuration starts here
42+
contents: write # 'write' access to repository contents
43+
pull-requests: write # 'write' access to pull requests
44+
steps:
45+
- uses: actions/checkout@v4
46+
with:
47+
fetch-depth: 0
48+
ref: add_trigger_to_run_the_benchmark
49+
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
50+
51+
- name: run mount
52+
shell: bash
53+
run: |
54+
./_setup_utils/mount.sh
55+
56+
# TODO: Remove this.
57+
- name: checkout correct branch
58+
shell: bash
59+
working-directory: /var/lib/mount/db-benchmark-metal
60+
run: |
61+
git checkout add_trigger_to_run_the_benchmark
62+
63+
- name: Install or Upgrade all solutions
64+
shell: bash
65+
working-directory: /var/lib/mount/db-benchmark-metal
66+
run: |
67+
python3 _setup_utils/install_all_solutions.py all --exclude clickhouse
68+
# installing/updating clickhouse needs sudo privileges
69+
sudo python3 _setup_utils/install_all_solutions.py clickhouse
70+
71+
- name: Modify run.conf to only have new versions
72+
shell: bash
73+
working-directory: /var/lib/mount/db-benchmark-metal
74+
run: |
75+
git diff --name-only **/VERSION > updated_solutions.txt
76+
sed -i 's/\/VERSION/ /g' updated_solutions.txt
77+
tr --delete '\n' < updated_solutions.txt > updated_solutions_no_newlines.txt
78+
export new_solutions=`cat updated_solutions_no_newlines.txt`
79+
echo "New benchmarks are " $new_solutions
80+
sed -i "s/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"${new_solutions}\"/g" run.conf
81+
82+
- name: Run the benchmark
83+
shell: bash
84+
working-directory: /var/lib/mount/db-benchmark-metal
85+
run: |
86+
ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
87+
if [ $ncores -eq 16 ]; then export MACHINE_TYPE="c6id.4xlarge"; fi
88+
if [ $ncores -eq 32 ]; then export MACHINE_TYPE="c6id.8xlarge"; fi
89+
if [ $ncores -eq 128 ]; then export MACHINE_TYPE="c6id.metal"; fi
90+
MACHINE_TYPE=$MACHINE_TYPE ./_run/run_small_medium.sh
91+
92+
- name: name new branch
93+
shell: bash
94+
run: |
95+
branch_name="results-`date +%Y-%m-%d-%Hh%Mm`"
echo "new_branch_name=$branch_name" >> $GITHUB_ENV
96+
echo "$branch_name"
97+
98+
- name: Commit updates
99+
shell: bash
100+
working-directory: /var/lib/mount/db-benchmark-metal
101+
run: |
102+
git config --global user.email ""
103+
git config --global user.name "Run Benchmark action"
104+
git remote add upstream git@github.com:duckdblabs/db-benchmark
105+
git fetch upstream
106+
git switch -c ${{ env.new_branch_name }}
107+
git add time.csv logs.csv **/VERSION
108+
git add run.conf
109+
git commit -m "new results"
110+
git push upstream ${{ env.new_branch_name }}
111+
112+
- name: Create Archive
113+
if: always()
114+
shell: bash
115+
working-directory: /var/lib/mount/db-benchmark-metal
116+
run: |
117+
mkdir -p out
118+
echo "guarantee not empty dir" > out/guarantee.txt
119+
zip -r out-dir.zip out/
120+
121+
- uses: actions/upload-artifact@v4
122+
if: always()
123+
with:
124+
name: out-dir.zip
125+
path: /var/lib/mount/db-benchmark-metal/out-dir.zip
126+
if-no-files-found: error
127+
128+
shutdown:
129+
name: shut down
130+
if: always()
131+
runs-on: ubuntu-latest
132+
needs:
133+
- start-aws-machine
134+
- run-benchmark
135+
136+
steps:
137+
- name: shutdown
138+
shell: bash
139+
env:
140+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
141+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
142+
AWS_DEFAULT_REGION: us-east-1
143+
run: aws ec2 stop-instances --instance-id ${{ env.instance_id }}
144+

_control/skipped_benchmarks.csv

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
solution,task,data,machine_type
2+
juliads,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
3+
juliads,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
4+
juliads,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
5+
juliads,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
6+
juliads,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
7+
juliadf,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
8+
juliadf,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
9+
juliadf,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
10+
juliadf,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
11+
juliadf,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
12+
R-arrow,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
13+
R-arrow,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
14+
R-arrow,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
15+
R-arrow,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
16+
R-arrow,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
17+
dplyr,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
18+
dplyr,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
19+
dplyr,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
20+
dplyr,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
21+
dplyr,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
22+
pandas,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
23+
pandas,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
24+
pandas,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
25+
pandas,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
26+
pandas,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
27+
pydatatable,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
28+
pydatatable,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
29+
pydatatable,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
30+
pydatatable,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
31+
pydatatable,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
32+
spark,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
33+
spark,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
34+
spark,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
35+
spark,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
36+
spark,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
37+
datafusion,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
38+
datafusion,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
39+
datafusion,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
40+
datafusion,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
41+
datafusion,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
42+
datafusion,join,J1_1e8_NA_0_0,c6id.4xlarge
43+
datafusion,join,J1_1e8_NA_5_0,c6id.4xlarge
44+
datafusion,join,J1_1e8_NA_0_1,c6id.4xlarge
45+
datafusion,join,J1_1e9_NA_0_0,c6id.4xlarge
46+
R-arrow,join,J1_1e9_NA_0_0,c6id.4xlarge
47+
dask,join,J1_1e9_NA_0_0,c6id.4xlarge
48+
datatable,join,J1_1e9_NA_0_0,c6id.4xlarge
49+
juliadf,join,J1_1e9_NA_0_0,c6id.4xlarge
50+
juliads,join,J1_1e9_NA_0_0,c6id.4xlarge
51+
pandas,join,J1_1e9_NA_0_0,c6id.4xlarge
52+
collapse,join,J1_1e9_NA_0_0,c6id.4xlarge
53+
polars,join,J1_1e9_NA_0_0,c6id.4xlarge
54+
pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge
55+
spark,join,J1_1e9_NA_0_0,c6id.4xlarge
56+
clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge
57+

_launcher/launch.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ if (any(is.na(dt$timeout_s))) stop("missing entries in ./_control/timeout.csv fo
4747
# detect if script has been already run before for currently installed version/revision
4848
lookup_run_batch(dt)
4949

50+
machine_type = Sys.getenv("MACHINE_TYPE")
51+
dt[,machine_type := machine_type]
52+
53+
skipped_benchmarks = fread("./_control/skipped_benchmarks.csv", logical01=TRUE, colClasses=c("character","character","character","character"))
54+
print("skipping benchmarks defined in _control/skipped_benchmarks.csv")
55+
print(skipped_benchmarks)
56+
57+
dt = dt[!skipped_benchmarks, on = c("solution", "task", "data", "machine_type")]
58+
5059
# print list of solutions that are going to be run in this batch so we know upfront which will be skipped
5160
cat("Benchmark solutions to run: ", dt[is.na(run_batch), paste(unique(solution),collapse=", ")], "\n", sep="")
5261

_launcher/setup.sh

100644100755
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,16 @@ set -e
55
mkdir -p data
66
mkdir -p out
77

8+
sudo apt-get update
9+
810
# install R
911
sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
1012
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
1113
sudo apt-get update -qq
1214
sudo apt-get install -y r-base-dev
15+
sudo apt-get install -y python3-dev virtualenv
16+
17+
sudo chmod a+w /usr/local/lib/R/site-library
1318

1419
# configure R
1520
echo 'LC_ALL=C' >> ~/.Renviron
@@ -22,8 +27,8 @@ Rscript -e 'install.packages(c("bit64","rmarkdown","data.table","rpivotTable","f
2227
Rscript -e 'sapply(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"), requireNamespace)'
2328

2429
# install duckdb for unpacking data
25-
curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip && unzip duckdb_cli-linux-amd64.zip
26-
sudo mv duckdb /usr/local/bin/
30+
curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip
31+
sudo unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin
2732

2833

2934
# install aws client to download benchmark data

_run/run_large.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# download and expand large data
22

33
# get groupby large (50GB datasets)
4-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request
4+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request --quiet
55
# get join small (50GB datasets)
6-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request
6+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request --quiet
77

88

99
# expand groupby-large datasets to csv
@@ -31,4 +31,4 @@ echo "Running all solutions on large (50GB) datasets"
3131
echo "done..."
3232
echo "removing data files"
3333
rm data/*.csv
34-
rm data/*.duckdb
34+
rm data/*.duckdb

_run/run_small_medium.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# first download and expand small data
22

33
# get groupby small (0.5GB and 5GB datasets)
4-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request
4+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request --quiet
55
# get join small (0.5GB and 5GB datasets)
6-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request
6+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request --quiet
77

88

99
# expand groupby-small datasets to csv
Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,87 @@
11
import os
22
import csv
33
import sys
4+
import subprocess
45

56
SOLUTIONS_FILENAME = "_control/solutions.csv"
67

78

9+
INCLUDE = set()
10+
811
def install_solution(solution_name):
912
min_setup_file_name = f"./{solution_name}/min-setup-{solution_name}.sh"
1013
setup_file_name = f"./{solution_name}/setup-{solution_name}.sh"
14+
upgrade_file_name = f"./{solution_name}/upg-{solution_name}.sh"
15+
get_version_filename = f"./{solution_name}/ver-{solution_name}.sh"
1116
print(f"Installing {solution_name}")
12-
if os.path.exists(min_setup_file_name):
13-
os.system(min_setup_file_name)
14-
elif os.path.exists(setup_file_name):
15-
os.system(setup_file_name)
17+
do_install = False
18+
try:
19+
result = subprocess.call([get_version_filename], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
20+
if result != 0:
21+
do_install = True
22+
except Exception as e:
23+
do_install = True
24+
25+
if do_install:
26+
if os.path.exists(min_setup_file_name):
27+
subprocess.call([min_setup_file_name])
28+
elif os.path.exists(setup_file_name):
29+
subprocess.call([setup_file_name])
30+
else:
31+
# print(f"no script for {setup_file_name} or {min_setup_file_name}")
32+
raise Exception(f"No script to install {solution_name}")
1633
else:
17-
# print(f"no script for {setup_file_name} or {min_setup_file_name}")
18-
raise Exception(f"No script to install {solution_name}")
34+
subprocess.call([upgrade_file_name])
1935

2036
# based on the name of the solution, run the {{solution}}/min-setup-{{solution}}.sh file.
2137
# if there is no min-setup-{{solution}}.sh, then run setup-{{solution}}.sh.
2238
# if error, exit with an error
2339
# else don't
24-
def install_all_solutions():
25-
install_solutions = set()
40+
def include_all_solutions():
41+
global INCLUDE
2642
with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
2743
solutions = csv.DictReader(solutions_file, delimiter=',')
2844
for row in solutions:
2945
if row['solution'] == "data.table":
30-
install_solutions.add("datatable")
46+
INCLUDE.add("datatable")
3147
else:
32-
install_solutions.add(row['solution'])
33-
for solution in install_solutions:
34-
install_solution(solution)
48+
INCLUDE.add(row['solution'])
3549

3650
if len(sys.argv) < 2:
37-
print("Usage: python3 install_all_solutions.py solution_name solution_name ...")
51+
print("""
52+
Usage: python3 install_all_solutions.py solution_name solution_name ...
53+
python3 install_all_solutions.py all --exclude clickhouse polars
54+
""")
3855
exit(1)
3956

4057
# first argument is file name
41-
for solution in sys.argv[1:]:
42-
if solution.strip() == "all":
43-
install_all_solutions()
44-
else:
45-
if solution == "data.table":
46-
install_solution("datatable")
47-
elif solution == "clickhouse":
48-
install_solution("clickhouse")
49-
install_solution("polars")
58+
59+
def main():
60+
global INCLUDE
61+
including = True
62+
for solution in sys.argv[1:]:
63+
if solution.strip() == "all":
64+
include_all_solutions()
65+
elif solution.strip() == "--exclude":
66+
including = False
67+
continue
5068
else:
51-
install_solution(solution)
52-
69+
if including:
70+
if solution == "data.table":
71+
INCLUDE.add("datatable")
72+
elif solution == "clickhouse":
73+
INCLUDE.add("clickhouse")
74+
INCLUDE.add("polars")
75+
else:
76+
INCLUDE.add(solution)
77+
else:
78+
sol = solution.strip()
79+
INCLUDE.discard(sol)
80+
81+
for solution in INCLUDE:
82+
install_solution(solution)
83+
84+
85+
if __name__ == "__main__":
86+
main()
87+

0 commit comments

Comments
 (0)