
Commit 3b09ce4

Merge pull request ClickHouse#363 from GlareDB/sean/update-glaredb
2 parents 84c53a7 + 346c84a

7 files changed: +194 -158 lines

glaredb/.gitignore

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# The binary
+/glaredb
+
+# Any of the hits files
+/data
+
+# Result data, should be manually copied to the right spot if being kept.
+results.csv
+results.json

glaredb/benchmark.sh

Lines changed: 36 additions & 14 deletions
@@ -1,20 +1,42 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
-# Install
+set -e
 
-sudo apt-get install -y unzip
-curl https://glaredb.com/install.sh | sh
+repo_root=$(git rev-parse --show-toplevel)
+script_dir=$(dirname "$0")
 
-wget https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/hits_compatible/athena/hits.parquet
+if [[ "$(basename "$repo_root")" == "glaredb" ]]; then
+    # Inside glaredb repo, build from source.
+    cargo build --release --bin glaredb
+    cp "${repo_root}/target/release/glaredb" "${script_dir}/glaredb"
+else
+    # Not in glaredb repo, use prebuilt binary.
+    export GLAREDB_INSTALL_DIR="${script_dir}"
+    export GLAREDB_VERSION="v25.5.2"
+    curl -fsSL https://glaredb.com/install.sh | sh
+fi
 
-cat queries.sql | while read -r query
-do
-    sync
-    echo 3 | sudo tee /proc/sys/vm/drop_caches
+# Get the data.
+mkdir -p "${script_dir}/data"
+pushd "${script_dir}/data"
 
-    for i in $(seq 1 3); do
-        ./glaredb --timing --query "${query}"
-    done;
-done 2>&1 | tee log.txt
+mode="${1:-single}" # Default to 'single' if no arg given.
+case "${mode}" in
+    single)
+        wget --continue https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/hits_compatible/athena/hits.parquet
+        ;;
+    partitioned)
+        seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
+        ;;
+    *)
+        echo "Invalid argument to 'benchmark.sh', expected 'single' or 'partitioned'"
+        exit 1
+        ;;
+esac
+popd
 
-cat log.txt | grep -oP 'Time: \d+\.\d+s|Error' | sed -r -e 's/Time: ([0-9]+\.[0-9]+)s/\1/; s/Error/null/' | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }'
+# Ensure working directory is the script dir. The view that gets created uses a
+# relative path.
+pushd "${script_dir}"
+
+./run.sh "${mode}"
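
A usage sketch (my reading of the new flow, not text from the commit): the script takes the dataset layout as an optional first argument and forwards it to run.sh.

    ./benchmark.sh              # defaults to 'single'
    ./benchmark.sh single       # one hits.parquet file
    ./benchmark.sh partitioned  # hits_0.parquet through hits_99.parquet, fetched in parallel

Note that run.sh only accepts 'single', so 'partitioned' downloads the partition files but then exits with an error at the run step.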

glaredb/create_single.sql

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+CREATE TEMP VIEW hits AS
+SELECT * REPLACE (EventDate::DATE AS EventDate)
+FROM read_parquet('./data/hits.parquet');
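
The view casts EventDate to a proper DATE over the raw Parquet file. run.sh passes this file to every glaredb invocation via --init, and because read_parquet uses a relative path, benchmark.sh pins the working directory to the script dir first. A quick manual sanity check might look like this (an illustration, assuming the data is already under ./data):

    ./glaredb --init create_single.sql -c "SELECT COUNT(*) FROM hits;"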

glaredb/queries.sql

Lines changed: 43 additions & 43 deletions
(Diff too large to render on the commit page.)

glaredb/results/c6a.4xlarge.json

Lines changed: 46 additions & 46 deletions
@@ -1,55 +1,55 @@
 {
-    "system": "GlareDB",
-    "date": "2024-02-02",
+    "system": "GlareDB (Parquet, single)",
+    "date": "2025-05-06",
     "machine": "c6a.4xlarge, 500gb gp2",
     "cluster_size": 1,
     "comment": "",
     "tags": ["Rust", "serverless"],
     "load_time": 0,
     "data_size": 14779976446,
     "result": [
-        [0.221,0.070,0.069],
-        [0.509,0.425,0.424],
-        [0.569,0.462,0.466],
-        [0.787,0.483,0.470],
-        [1.339,1.220,1.200],
-        [1.800,1.644,1.649],
-        [0.490,0.418,0.410],
-        [0.509,0.423,0.423],
-        [2.009,1.880,1.920],
-        [3.383,3.134,3.147],
-        [0.950,0.695,0.702],
-        [0.891,0.743,0.721],
-        [1.787,1.646,1.654],
-        [4.032,3.007,2.974],
-        [1.911,1.746,1.762],
-        [1.538,1.321,1.360],
-        [3.746,3.335,3.327],
-        [3.611,3.064,2.993],
-        [7.835,6.293,6.414],
-        [0.882,0.469,0.470],
-        [9.903,1.943,1.947],
-        [11.653,2.182,2.172],
-        [22.503,4.410,4.416],
-        [56.481,11.754,11.769],
-        [3.039,0.925,0.917],
-        [1.132,0.854,0.855],
-        [2.939,0.991,0.973],
-        [9.958,2.688,2.695],
-        [9.431,5.639,5.614],
-        [1.027,0.872,0.814],
-        [2.611,1.508,1.497],
-        [6.177,1.887,1.960],
-        [9.675,9.095,8.891],
-        [12.268,7.139,7.063],
-        [12.675,7.661,7.671],
-        [2.418,2.250,2.210],
-        [9.998,2.095,2.066],
-        [9.273,2.782,2.722],
-        [10.015,2.085,2.079],
-        [18.876,3.284,3.317],
-        [2.963,0.939,0.917],
-        [2.165,0.973,0.936],
-        [1.380,0.901,0.864]
-    ]
+        [0.044,0.038,0.037],
+        [0.147,0.131,0.134],
+        [0.231,0.207,0.204],
+        [0.311,0.160,0.152],
+        [1.064,1.010,1.005],
+        [1.072,1.008,1.038],
+        [0.127,0.111,0.111],
+        [0.159,0.139,0.146],
+        [1.583,1.512,1.480],
+        [1.919,1.839,1.827],
+        [0.598,0.535,0.528],
+        [0.680,0.614,0.616],
+        [1.105,1.013,0.995],
+        [2.921,2.089,2.053],
+        [1.228,1.109,1.106],
+        [1.382,1.349,1.330],
+        [2.997,2.447,2.431],
+        [2.707,2.086,2.092],
+        [5.820,4.520,4.625],
+        [0.246,0.193,0.193],
+        [9.619,1.751,1.761],
+        [11.123,1.493,1.476],
+        [21.896,3.584,3.538],
+        [55.927,14.498,14.510],
+        [2.526,0.884,0.873],
+        [1.022,0.974,0.964],
+        [2.544,1.081,1.077],
+        [9.457,1.559,1.503],
+        [11.226,12.444,12.082],
+        [4.675,4.722,4.769],
+        [2.161,1.257,1.251],
+        [5.866,1.583,1.582],
+        [7.481,6.244,6.247],
+        [10.969,3.891,3.921],
+        [11.092,4.200,4.157],
+        [1.472,1.433,1.446],
+        [0.243,0.191,0.191],
+        [0.180,0.154,0.153],
+        [0.180,0.127,0.124],
+        [0.444,0.348,0.343],
+        [0.102,0.075,0.075],
+        [0.089,0.069,0.074],
+        [0.094,0.079,0.079]
+    ]
 }
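
Each entry in "result" is one ClickBench query run three times (TRIES=3 in run.sh): a cold run taken right after dropping the OS page cache, then two warm repeats. As an illustrative one-liner (not part of the commit), jq can pull the best-of-three time per query:

    jq '.result | map(min)' results/c6a.4xlarge.json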

glaredb/results/c6a.metal.json

Lines changed: 0 additions & 55 deletions
This file was deleted.

glaredb/run.sh

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+
+set -eu
+set -o pipefail
+
+case "$1" in
+    single)
+        create_sql_file="create_single.sql"
+        ;;
+    *)
+        echo "Invalid argument to 'run.sh', expected 'single'"
+        exit 1
+        ;;
+esac
+
+TRIES=3
+QUERY_NUM=0
+
+echo "[" > results.json
+echo "query_num,iteration,duration" > results.csv
+
+cat queries.sql | while read -r query; do
+    sync
+    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
+
+    echo "${QUERY_NUM}: ${query}"
+
+    [ "${QUERY_NUM}" != 0 ] && echo "," >> results.json
+    echo -n " [" >> results.json
+
+    for i in $(seq 1 $TRIES); do
+        output=$(./glaredb --init "${create_sql_file}" -c ".timer on" -c "${query}")
+        duration=$(awk -F': ' '/^Execution duration/ { printf "%.3f\n", $2 }' <<< "$output")
+
+        echo "$output"
+
+        if [ -z "${duration}" ]; then
+            echo "Query failed"
+            exit 1
+        fi
+
+        # JSON results
+        echo -n "${duration}" >> results.json
+        [ "${i}" != "${TRIES}" ] && echo -n "," >> results.json
+
+        # CSV results
+        echo "${QUERY_NUM},${i},${duration}" >> results.csv
+    done
+
+    echo -n "]" >> results.json
+
+    QUERY_NUM=$((QUERY_NUM + 1))
+done
+
+echo "" >> results.csv
+echo "" >> results.json
+echo "]" >> results.json
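
run.sh expects the glaredb binary, queries.sql, and the ./data directory to sit in the current working directory; benchmark.sh arranges that before invoking it as:

    ./run.sh single

Timings come from the "Execution duration: ..." line that the awk filter extracts from glaredb's .timer output, and are written both as a ClickBench-style JSON array (results.json) and as a flat CSV (results.csv).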
