Skip to content

Commit b4ece2c

Browse files
authored
Merge pull request ClickHouse#369 from spiraldb/adamg/datafusion-vortex
Add benchmarks for querying Vortex files with Datafusion
2 parents b2bd265 + a219ee3 commit b4ece2c

File tree

5 files changed

+231
-0
lines changed

5 files changed

+231
-0
lines changed

datafusion-vortex/benchmark.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash
#
# Sets up and runs the DataFusion + Vortex benchmark:
#   1. installs Rust and the system packages needed to build,
#   2. builds the `clickbench` utility from the Vortex repository at a pinned tag,
#   3. runs `clickbench` once per flavor so the Vortex files exist before timing,
#   4. runs ./run.sh for the "single" and "partitioned" flavors.
#
# Any failing step aborts the script (set -euo pipefail).

set -euo pipefail

# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
bash rust-init.sh -y
source ~/.cargo/env

# Install Dependencies
sudo apt-get update
sudo apt-get install --yes gcc jq build-essential

# Install Vortex, pinned to the 0.34.0 release tag.
# Only clone when there is no checkout yet, so that a re-run reuses the existing
# clone while a genuine clone failure (network, permissions) still aborts the
# script instead of being silently ignored.
if [[ ! -d vortex/.git ]]; then
    git clone https://github.com/spiraldb/vortex.git
fi
cd vortex
git checkout 0.34.0
git submodule update --init
# We build a release version of the benchmarking utility using mimalloc, just like the datafusion-cli
cargo build --release --bin clickbench --package bench-vortex
# Make the freshly built `clickbench` binary visible to this script and to run.sh.
PATH="${PWD}/target/release:${PATH}"
export PATH
cd ..

# Vortex's benchmarking utility generates the appropriate Vortex files by itself,
# so we run it once per flavor to make sure they exist before we start measuring.
# This downloads parquet files (with time and string columns already converted to
# the logically correct datatype) and generates Vortex files from them.
clickbench -i 1 --targets datafusion:vortex --display-format gh-json -q 0 --hide-progress-bar --flavor single
clickbench -i 1 --targets datafusion:vortex --display-format gh-json -q 0 --hide-progress-bar --flavor partitioned

# Run the benchmarks for both the single-file and the partitioned flavor.
./run.sh single
./run.sh partitioned
32+

datafusion-vortex/queries.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
3+
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
4+
SELECT AVG("UserID") FROM hits;
5+
SELECT COUNT(DISTINCT "UserID") FROM hits;
6+
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
7+
SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
8+
SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
9+
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10+
SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
11+
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
12+
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
13+
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
14+
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
15+
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
16+
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19+
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
22+
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
23+
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
25+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
26+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
27+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
28+
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), 
SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
31+
SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
32+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
33+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
34+
SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
35+
SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
36+
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
38+
SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
39+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2024-05-20",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v47.0.0 (e44330) - Vortex 0.34.0 (4645a2)",
7+
"tags": ["Rust", "column-oriented", "embedded", "stateless"],
8+
"load_time": 0,
9+
"data_size": 16030812776,
10+
"result": [
11+
[0.190168404, 0.032908126, 0.028567336],
12+
[0.197232935, 0.048382136, 0.046421671],
13+
[0.250624209, 0.084918948, 0.079749917],
14+
[0.893215315, 0.135351545, 0.130987004],
15+
[1.392734886, 0.823274741, 0.832587647],
16+
[1.36376189, 0.757423748, 0.758740028],
17+
[0.178384398, 0.02969999, 0.03350486],
18+
[0.218607193, 0.047277585, 0.046760947],
19+
[1.70476231, 0.878636825, 0.872227002],
20+
[2.082373223, 0.991509975, 1.05086616],
21+
[0.98410887, 0.17026521, 0.173454965],
22+
[1.083189592, 0.182121674, 0.174995889],
23+
[1.503440978, 0.654882383, 0.670131187],
24+
[3.105944537, 1.233938102, 1.242377475],
25+
[1.583146849, 0.647639599, 0.648628942],
26+
[1.309980901, 0.935074889, 0.951897796],
27+
[3.116824776, 1.766182201, 1.75505341],
28+
[3.13095758, 1.704609978, 1.696661839],
29+
[4.660634808, 3.324680758, 3.324063243],
30+
[0.441579681, 0.080097425, 0.070227726],
31+
[13.725473472, 0.601948982, 0.598374177],
32+
[14.689527359, 0.709040556, 0.738804711],
33+
[17.461011602, 1.400376267, 1.400084216],
34+
[48.858555559, 3.81009392, 3.737116171],
35+
[2.134026036, 0.257863211, 0.266044164],
36+
[1.194510333, 0.208411903, 0.205209087],
37+
[2.113798613, 0.305284949, 0.317681204],
38+
[13.562791159, 1.229502358, 1.208038414],
39+
[12.832634507, 10.185001058, 9.429494206],
40+
[0.609869365, 0.429862613, 0.428073911],
41+
[2.661715231, 0.532374626, 0.539352259],
42+
[5.77896413, 0.625172027, 0.625849387],
43+
[4.489662152, 3.15151116, 3.173024888],
44+
[13.80897203, 3.301920755, 3.315570988],
45+
[13.686335224, 3.333205431, 3.330790474],
46+
[1.282416302, 1.113708109, 1.07930895],
47+
[0.289388594, 0.130178919, 0.139454432],
48+
[0.240974374, 0.070231053, 0.072651871],
49+
[0.232028246, 0.063181124, 0.061856407],
50+
[0.396877605, 0.236549169, 0.241319459],
51+
[0.224633554, 0.046571383, 0.048503385],
52+
[0.214310274, 0.050147026, 0.048456502],
53+
[0.234745087, 0.064590288, 0.063707943]
54+
]
55+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"system": "DataFusion (Vortex, single)",
3+
"date": "2024-05-20",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v47.0.0 (e44330) - Vortex 0.34.0 (4645a2)",
7+
"tags": ["Rust", "column-oriented", "embedded", "stateless"],
8+
"load_time": 0,
9+
"data_size": 17215640128,
10+
"result": [
11+
[0.141480758, 0.012545583, 0.012392761],
12+
[0.181382115, 0.026653701, 0.026936212],
13+
[0.282474733, 0.078885164, 0.078039942],
14+
[0.679924914, 0.11919166, 0.116286645],
15+
[1.054943476, 0.8040958, 0.832445355],
16+
[1.090415595, 0.829868936, 0.829622018],
17+
[0.170879192, 0.012159045, 0.01223504],
18+
[0.23391203, 0.029247912, 0.030202152],
19+
[1.654670549, 0.855568544, 0.86024496],
20+
[2.391636659, 1.027901922, 1.053496926],
21+
[0.766333381, 0.156641813, 0.152039054],
22+
[0.827249522, 0.17317988, 0.179940576],
23+
[1.446704309, 0.71628753, 0.716993912],
24+
[2.80230849, 1.092804952, 1.113480373],
25+
[1.519657863, 0.693184573, 0.699690051],
26+
[1.140707586, 0.986270372, 0.94835135],
27+
[2.530029518, 1.768219286, 1.804834302],
28+
[2.538327684, 1.704263592, 1.736171757],
29+
[4.179530481, 3.332336459, 3.364090216],
30+
[0.360142932, 0.047734529, 0.045213023],
31+
[13.269683492, 0.53511803, 0.53864554],
32+
[14.513555909, 0.673683725, 0.691400197],
33+
[17.940377894, 1.410351611, 1.443679458],
34+
[53.435175298, 4.641778521, 4.664277646],
35+
[1.941012081, 0.273817491, 0.272369074],
36+
[0.902833605, 0.201341035, 0.224143354],
37+
[2.005485132, 0.356166405, 0.358523637],
38+
[14.189616267, 1.461826632, 1.429376903],
39+
[21.828070141, 11.463846327, 9.883968208],
40+
[0.584918354, 0.386498574, 0.376434759],
41+
[2.731601722, 0.564045029, 0.568457696],
42+
[5.812685218, 0.601464498, 0.625651876],
43+
[4.385733109, 3.174433554, 3.210718986],
44+
[13.439658252, 3.307697903, 3.339940759],
45+
[13.347894058, 3.555926041, 3.365055796],
46+
[1.233341691, 1.015625136, 1.016971005],
47+
[0.344006452, 0.109427194, 0.118646414],
48+
[0.28388004, 0.057840973, 0.066544644],
49+
[0.283143326, 0.068711564, 0.068289703],
50+
[0.462529826, 0.256544726, 0.259010007],
51+
[0.268563731, 0.033184136, 0.033833914],
52+
[0.259093093, 0.029939417, 0.029739711],
53+
[0.270240275, 0.047861387, 0.046710322]
54+
]
55+
}

datafusion-vortex/run.sh

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/bash
#
# Runs queries 0..42 from ./queries.sql through `clickbench` (DataFusion on
# Vortex files) for one dataset flavor and reports the timings.
#
# Usage: ./run.sh [single|partitioned]
#
# Output:
#   stdout       one line per query of the form "[t1, t2, t3]," with times in
#                seconds; a try that produced no value is printed as null.
#   results.csv  one "query_num,try,seconds" row per try (truncated at start;
#                the seconds field is empty when the try produced no value).

# Exactly one argument (the flavor) is required.
if [[ "$#" -ne 1 ]]; then
    echo "Usage: $0 [single|partitioned]" >&2
    exit 1
fi

# Validate the flavor; it is forwarded to clickbench via --flavor.
case "$1" in
    single | partitioned)
        FLAVOR=$1
        echo "Running benchmark for $FLAVOR"
        ;;
    *)
        echo "Invalid argument. Please use 'single' or 'partitioned'." >&2
        exit 1
        ;;
esac

# Start from an empty results file (the redirection creates it if missing).
: > results.csv

TRIES=3
OS=$(uname -s)

for ((query_num = 0; query_num <= 42; query_num++)); do
    sync

    # Drop the OS file caches once per query, before the first try.
    if [[ "$OS" == "Linux" ]]; then
        echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
    elif [[ "$OS" == "Darwin" ]]; then
        sudo purge
    fi

    printf '['
    for ((i = 1; i <= TRIES; i++)); do
        # Parse query results out of the JSON output, which reports the time in ns.
        # A failing clickbench/jq leaves RES empty; that is reported as null below
        # rather than aborting the whole run.
        RES=$(RUST_LOG=off clickbench -i 1 --flavor "$FLAVOR" --targets datafusion:vortex --display-format gh-json --queries-file ./queries.sql -q "$query_num" --hide-progress-bar | jq ".value / 1000000000")

        if [[ -n "$RES" ]]; then
            printf '%s' "$RES"
        else
            printf 'null'
        fi

        # Separate the tries with commas, but not after the last one.
        if ((i != TRIES)); then
            printf ', '
        fi

        echo "${query_num},${i},${RES}" >> results.csv
    done
    echo "],"
done

0 commit comments

Comments
 (0)