Skip to content

Commit ed8e5ed

Browse files
authored
Merge pull request #112 from duckdblabs/add_trigger_to_run_the_benchmark
Add action to run the benchmark
2 parents cecf694 + 26eeabe commit ed8e5ed

File tree

22 files changed

+339
-51
lines changed

22 files changed

+339
-51
lines changed

.github/workflows/RunBenchmark.yml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
name: Run benchmark
2+
on:
3+
workflow_dispatch:
4+
5+
concurrency:
6+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/master' || github.sha }}
7+
cancel-in-progress: true
8+
9+
env:
10+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
11+
gh_issue_repo: duckdblabs/db-benchmark
12+
instance_id: i-0e75a4f0e0b61c475
13+
14+
jobs:
start-aws-machine:
15+
name: Start aws-small-machine
16+
runs-on: ubuntu-latest
17+
18+
steps:
19+
- name: Start EC2 runner
20+
shell: bash
21+
env:
22+
AWS_ACCESS_KEY_ID: ${{secrets.AWS_ACCESS_KEY_ID}}
23+
AWS_SECRET_ACCESS_KEY: ${{secrets.AWS_SECRET_ACCESS_KEY}}
24+
AWS_DEFAULT_REGION: us-east-1
25+
run: aws ec2 start-instances --instance-id ${{ env.instance_id }}
26+
27+
- name: Create issue if failure
28+
shell: bash
29+
if: ${{ failure() && contains(github.ref_name, 'main') }}
30+
run: |
31+
gh issue create --repo ${{ env.gh_issue_repo }} --title "Weekly Regression Test Failure" --body "AWS box with instance-id ${{ env.instance_id }} could not be started"
32+
33+
34+
run-benchmark:
35+
name: Regression Tests all solutions
36+
env:
37+
CC: gcc-10
38+
CXX: g++-10
39+
GEN: ninja
40+
runs-on: self-hosted
41+
permissions: # Job-level permissions configuration starts here
42+
contents: write # 'write' access to repository contents
43+
pull-requests: write # 'write' access to pull requests
44+
steps:
45+
- uses: actions/checkout@v4
46+
with:
47+
fetch-depth: 0
48+
ref: add_trigger_to_run_the_benchmark
49+
persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
50+
51+
- name: run mount
52+
shell: bash
53+
run: |
54+
./_setup_utils/mount.sh
55+
56+
# TODO: Remove this.
57+
- name: checkout correct branch
58+
shell: bash
59+
working-directory: /var/lib/mount/db-benchmark-metal
60+
run: |
61+
git checkout add_trigger_to_run_the_benchmark
62+
63+
- name: Install or Upgrade all solutions
64+
shell: bash
65+
working-directory: /var/lib/mount/db-benchmark-metal
66+
run: |
67+
python3 _setup_utils/install_all_solutions.py all --exclude clickhouse
68+
# installing/updating clickhouse needs sudo privileges
69+
sudo python3 _setup_utils/install_all_solutions.py clickhouse
70+
71+
- name: Modify run.conf to only have new versions
72+
shell: bash
73+
working-directory: /var/lib/mount/db-benchmark-metal
74+
run: |
75+
git diff --name-only **/VERSION > updated_solutions.txt
76+
sed -i 's/\/VERSION/ /g' updated_solutions.txt
77+
tr --delete '\n' < updated_solutions.txt > updated_solutions_no_newlines.txt
78+
export new_solutions=`cat updated_solutions_no_newlines.txt`
79+
echo "New benchmarks are " $new_solutions
80+
sed -i "s/export RUN_SOLUTIONS=.*/export RUN_SOLUTIONS=\"${new_solutions}\"/g" run.conf
81+
82+
- name: Run the benchmark
83+
shell: bash
84+
working-directory: /var/lib/mount/db-benchmark-metal
85+
run: |
86+
ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
87+
if [ $ncores -eq 16 ]; then export MACHINE_TYPE="c6id.4xlarge"; fi
88+
if [ $ncores -eq 32 ]; then export MACHINE_TYPE="c6id.8xlarge"; fi
89+
if [ $ncores -eq 128 ]; then export MACHINE_TYPE="c6id.metal"; fi
90+
MACHINE_TYPE=$MACHINE_TYPE ./_run/run_small_medium.sh
91+
92+
- name: name new branch
93+
shell: bash
94+
run: |
95+
branch_name="results-`date +%Y-%m-%d-%Hh%Mm`"
echo "new_branch_name=$branch_name" >> $GITHUB_ENV
96+
echo "$branch_name"
97+
98+
- name: Commit updates
99+
shell: bash
100+
working-directory: /var/lib/mount/db-benchmark-metal
101+
run: |
102+
git config --global user.email ""
103+
git config --global user.name "Run Benchmark action"
104+
git remote add upstream git@github.com:duckdblabs/db-benchmark
105+
git fetch upstream
106+
git switch -c ${{ env.new_branch_name }}
107+
git add time.csv logs.csv **/VERSION
108+
git add run.conf
109+
git commit -m "new results"
110+
git push upstream ${{ env.new_branch_name }}
111+
112+
- name: Create Archive
113+
if: always()
114+
shell: bash
115+
working-directory: /var/lib/mount/db-benchmark-metal
116+
run: |
117+
mkdir -p out
118+
echo "guarantee not empty dir" > out/guarantee.txt
119+
zip -r out-dir.zip out/
120+
121+
- uses: actions/upload-artifact@v4
122+
if: always()
123+
with:
124+
name: out-dir.zip
125+
path: /var/lib/mount/db-benchmark-metal/out-dir.zip
126+
if-no-files-found: error
127+
128+
shutdown:
129+
name: shut down
130+
if: always()
131+
runs-on: ubuntu-latest
132+
needs:
133+
- start-aws-machine
134+
- run-benchmark
135+
136+
steps:
137+
- name: shutdown
138+
shell: bash
139+
env:
140+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
141+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
142+
AWS_DEFAULT_REGION: us-east-1
143+
run: aws ec2 stop-instances --instance-id ${{ env.instance_id }}
144+

_control/skipped_benchmarks.csv

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
solution,task,data,machine_type
2+
juliads,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
3+
juliads,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
4+
juliads,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
5+
juliads,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
6+
juliads,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
7+
juliadf,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
8+
juliadf,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
9+
juliadf,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
10+
juliadf,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
11+
juliadf,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
12+
R-arrow,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
13+
R-arrow,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
14+
R-arrow,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
15+
R-arrow,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
16+
R-arrow,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
17+
dplyr,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
18+
dplyr,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
19+
dplyr,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
20+
dplyr,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
21+
dplyr,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
22+
pandas,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
23+
pandas,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
24+
pandas,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
25+
pandas,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
26+
pandas,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
27+
pydatatable,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
28+
pydatatable,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
29+
pydatatable,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
30+
pydatatable,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
31+
pydatatable,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
32+
spark,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
33+
spark,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
34+
spark,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
35+
spark,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
36+
spark,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
37+
datafusion,groupby,G1_1e9_1e2_0_0,c6id.4xlarge
38+
datafusion,groupby,G1_1e9_1e1_0_0,c6id.4xlarge
39+
datafusion,groupby,G1_1e9_2e0_0_0,c6id.4xlarge
40+
datafusion,groupby,G1_1e9_1e2_0_1,c6id.4xlarge
41+
datafusion,groupby,G1_1e9_1e2_5_0,c6id.4xlarge
42+
datafusion,join,J1_1e8_NA_0_0,c6id.4xlarge
43+
datafusion,join,J1_1e8_NA_5_0,c6id.4xlarge
44+
datafusion,join,J1_1e8_NA_0_1,c6id.4xlarge
45+
datafusion,join,J1_1e9_NA_0_0,c6id.4xlarge
46+
R-arrow,join,J1_1e9_NA_0_0,c6id.4xlarge
47+
dask,join,J1_1e9_NA_0_0,c6id.4xlarge
48+
datatable,join,J1_1e9_NA_0_0,c6id.4xlarge
49+
juliadf,join,J1_1e9_NA_0_0,c6id.4xlarge
50+
juliads,join,J1_1e9_NA_0_0,c6id.4xlarge
51+
pandas,join,J1_1e9_NA_0_0,c6id.4xlarge
52+
collapse,join,J1_1e9_NA_0_0,c6id.4xlarge
53+
polars,join,J1_1e9_NA_0_0,c6id.4xlarge
54+
pydatatable,join,J1_1e9_NA_0_0,c6id.4xlarge
55+
spark,join,J1_1e9_NA_0_0,c6id.4xlarge
56+
clickhouse,join,J1_1e9_NA_0_0,c6id.4xlarge
57+

_launcher/launch.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ if (any(is.na(dt$timeout_s))) stop("missing entries in ./_control/timeout.csv fo
4747
# detect if script has been already run before for currently installed version/revision
4848
lookup_run_batch(dt)
4949

50+
machine_type = Sys.getenv("MACHINE_TYPE")
51+
dt[,machine_type := machine_type]
52+
53+
skipped_benchmarks = fread("./_control/skipped_benchmarks.csv", logical01=TRUE, colClasses=c("character","character","character","character"))
54+
print("skipping benchmarks defined in _control/skipped_benchmarks.csv")
55+
print(skipped_benchmarks)
56+
57+
dt = dt[!skipped_benchmarks, on = c("solution", "task", "data", "machine_type")]
58+
5059
# print list of solutions that are going to be run in this batch so we know upfront which will be skipped
5160
cat("Benchmark solutions to run: ", dt[is.na(run_batch), paste(unique(solution),collapse=", ")], "\n", sep="")
5261

_launcher/setup.sh

100644100755
Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,16 @@ set -e
55
mkdir -p data
66
mkdir -p out
77

8+
sudo apt-get update
9+
810
# install R
911
sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
1012
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
1113
sudo apt-get update -qq
1214
sudo apt-get install -y r-base-dev
15+
sudo apt-get install -y python3-dev virtualenv
16+
17+
sudo chmod a+w /usr/local/lib/R/site-library
1318

1419
# configure R
1520
echo 'LC_ALL=C' >> ~/.Renviron
@@ -22,8 +27,8 @@ Rscript -e 'install.packages(c("bit64","rmarkdown","data.table","rpivotTable","f
2227
Rscript -e 'sapply(c("bit64","rmarkdown","data.table","rpivotTable","formattable","lattice"), requireNamespace)'
2328

2429
# install duckdb for unpacking data
25-
curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip && unzip duckdb_cli-linux-amd64.zip
26-
sudo mv duckdb /usr/local/bin/
30+
curl --fail --location --progress-bar --output duckdb_cli-linux-amd64.zip https://github.com/duckdb/duckdb/releases/download/v1.2.0/duckdb_cli-linux-amd64.zip
31+
sudo unzip duckdb_cli-linux-amd64.zip -d /usr/local/bin
2732

2833

2934
# install aws client to download benchmark data

_run/run_large.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# download and expand large data
22

33
# get groupby large (50GB datasets)
4-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request
4+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb --no-sign-request --quiet
55
# get join small (50GB datasets)
6-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request
6+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb --no-sign-request --quiet
77

88

99
# expand groupby-large datasets to csv
@@ -31,4 +31,4 @@ echo "Running all solutions on large (50GB) datasets"
3131
echo "done..."
3232
echo "removing data files"
3333
rm data/*.csv
34-
rm data/*.duckdb
34+
rm data/*.duckdb

_run/run_small_medium.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# first download and expand small data
22

33
# get groupby small (0.5GB and 5GB datasets)
4-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request
4+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb --no-sign-request --quiet
55
# get join small (0.5GB and 5GB datasets)
6-
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request
6+
aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb --no-sign-request --quiet
77

88

99
# expand groupby-small datasets to csv
Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,87 @@
11
import os
22
import csv
33
import sys
4+
import subprocess
45

56
SOLUTIONS_FILENAME = "_control/solutions.csv"
67

78

9+
INCLUDE = set()
10+
811
def install_solution(solution_name):
912
min_setup_file_name = f"./{solution_name}/min-setup-{solution_name}.sh"
1013
setup_file_name = f"./{solution_name}/setup-{solution_name}.sh"
14+
upgrade_file_name = f"./{solution_name}/upg-{solution_name}.sh"
15+
get_version_filename = f"./{solution_name}/ver-{solution_name}.sh"
1116
print(f"Installing {solution_name}")
12-
if os.path.exists(min_setup_file_name):
13-
os.system(min_setup_file_name)
14-
elif os.path.exists(setup_file_name):
15-
os.system(setup_file_name)
17+
do_install = False
18+
try:
19+
result = subprocess.call([get_version_filename], stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL)
20+
if result != 0:
21+
do_install = True
22+
except Exception as e:
23+
do_install = True
24+
25+
if do_install:
26+
if os.path.exists(min_setup_file_name):
27+
subprocess.call([min_setup_file_name])
28+
elif os.path.exists(setup_file_name):
29+
subprocess.call([setup_file_name])
30+
else:
31+
# print(f"no script for {setup_file_name} or {min_setup_file_name}")
32+
raise Exception(f"No script to install {solution_name}")
1633
else:
17-
# print(f"no script for {setup_file_name} or {min_setup_file_name}")
18-
raise Exception(f"No script to install {solution_name}")
34+
subprocess.call([upgrade_file_name])
1935

2036
# based on the name of the solution, run the {{solution}}/min-setup-{{solution}}.sh file.
2137
# if there is no min-setup-{{solution}}.sh, then run setup-{{solution}}.sh.
2238
# if error, exit with an error
2339
# else don't
24-
def install_all_solutions():
25-
install_solutions = set()
40+
def include_all_solutions():
41+
global INCLUDE
2642
with open(SOLUTIONS_FILENAME, newline="") as solutions_file:
2743
solutions = csv.DictReader(solutions_file, delimiter=',')
2844
for row in solutions:
2945
if row['solution'] == "data.table":
30-
install_solutions.add("datatable")
46+
INCLUDE.add("datatable")
3147
else:
32-
install_solutions.add(row['solution'])
33-
for solution in install_solutions:
34-
install_solution(solution)
48+
INCLUDE.add(row['solution'])
3549

3650
if len(sys.argv) < 2:
37-
print("Usage: python3 install_all_solutions.py solution_name solution_name ...")
51+
print("""
52+
Usage: python3 install_all_solutions.py solution_name solution_name ...
53+
python3 install_all_solutions.py all --exclude clickhouse polars
54+
""")
3855
exit(1)
3956

4057
# first argument is file name
41-
for solution in sys.argv[1:]:
42-
if solution.strip() == "all":
43-
install_all_solutions()
44-
else:
45-
if solution == "data.table":
46-
install_solution("datatable")
47-
elif solution == "clickhouse":
48-
install_solution("clickhouse")
49-
install_solution("polars")
58+
59+
def main():
60+
global INCLUDE
61+
including = True
62+
for solution in sys.argv[1:]:
63+
if solution.strip() == "all":
64+
include_all_solutions()
65+
elif solution.strip() == "--exclude":
66+
including = False
67+
continue
5068
else:
51-
install_solution(solution)
52-
69+
if including:
70+
if solution == "data.table":
71+
INCLUDE.add("datatable")
72+
elif solution == "clickhouse":
73+
INCLUDE.add("clickhouse")
74+
INCLUDE.add("polars")
75+
else:
76+
INCLUDE.add(solution)
77+
else:
78+
sol = solution.strip()
79+
INCLUDE.discard(sol)
80+
81+
for solution in INCLUDE:
82+
install_solution(solution)
83+
84+
85+
if __name__ == "__main__":
86+
main()
87+

0 commit comments

Comments
 (0)