Skip to content

Commit e4cc95b

Browse files
committed
Add benchmarks for querying Vortex files with datafusion
1 parent 177204c commit e4cc95b

File tree

6 files changed

+232
-1
lines changed

6 files changed

+232
-1
lines changed

datafusion-vortex/benchmark.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Builds Vortex's `clickbench` benchmarking utility from source and runs the
# benchmark queries against Vortex files via ./run.sh, once for the "single"
# flavor and once for the "partitioned" flavor.
#
# Optional environment:
#   VORTEX_VERSION  Git ref of spiraldb/vortex to check out and build
#                   (default: 0.29.0).

set -euo pipefail

VORTEX_VERSION="${VORTEX_VERSION:-0.29.0}"

# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
bash rust-init.sh -y
source ~/.cargo/env

# Install Dependencies
sudo apt-get update
sudo apt-get install --yes gcc jq build-essential

# Install Vortex from a pinned release tag (not the tip of the main branch)
git clone https://github.com/spiraldb/vortex.git
cd vortex
git checkout "$VORTEX_VERSION"
git submodule update --init
# We build a release version of the benchmarking utility using mimalloc, just like the datafusion-cli
cargo build --release --bin clickbench --package bench-vortex --features mimalloc
# Make the freshly built `clickbench` binary visible to this script and to run.sh
export PATH="$(pwd)/target/release:$PATH"
cd ..

# Vortex's benchmarking utility generates appropriate Vortex files by itself, so we just run it to make sure they exist before we start measuring.
# This will download parquet files (with time and string columns already converted to the logically correct datatype) and generate Vortex files from them.
clickbench -i 1 --flavor single --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics
clickbench -i 1 --flavor partitioned --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics

# Run the benchmarks for the single and partitioned flavors; the warm-up
# invocations above have already generated the relevant Vortex files.
./run.sh single
./run.sh partitioned

datafusion-vortex/queries.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
3+
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
4+
SELECT AVG("UserID") FROM hits;
5+
SELECT COUNT(DISTINCT "UserID") FROM hits;
6+
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
7+
SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
8+
SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
9+
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10+
SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
11+
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
12+
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
13+
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
14+
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
15+
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
16+
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19+
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
22+
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
23+
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
25+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
26+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
27+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
28+
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), 
SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
31+
SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
32+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
33+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
34+
SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
35+
SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
36+
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
38+
SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
39+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2024-04-17",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v46.0.0 (26058ac) - Vortex 0.28",
7+
"tags": [
8+
"Rust",
9+
"column-oriented",
10+
"embedded",
11+
"stateless"
12+
],
13+
"load_time": 0,
14+
"data_size": 17429875344,
15+
"result": [
16+
[0.209022893, 0.045973819, 0.047847208],
17+
[0.249353289, 0.068623551, 0.063561421],
18+
[0.290376842, 0.101421863, 0.102222783],
19+
[1.224526121, 0.175672048, 0.175667187],
20+
[1.614875461, 0.887772573, 0.879480369],
21+
[1.242317063, 0.8545056, 0.847013568],
22+
[0.197028082, 0.045456615, 0.042790467],
23+
[0.250948665, 0.068804064, 0.068127307],
24+
[2.021128572, 1.074058028, 1.053019429],
25+
[2.399406362, 1.065030067, 1.00739901],
26+
[1.368950062, 0.197224019, 0.188145505],
27+
[1.398013148, 0.219433768, 0.215830685],
28+
[1.206921316, 0.698591978, 0.691705491],
29+
[3.318140248, 1.23003645, 1.135895665],
30+
[1.338126092, 0.672885689, 0.687993987],
31+
[1.658468508, 1.025608376, 1.018830423],
32+
[3.540013338, 2.03828511, 2.011581315],
33+
[3.501757931, 1.959953371, 2.037370228],
34+
[4.652143676, 3.389895802, 3.325770566],
35+
[0.811237373, 0.100190888, 0.09829752],
36+
[12.981442541, 0.684403273, 0.681713323],
37+
[13.939847319, 0.790628795, 0.786868298],
38+
[19.871398231, 1.21411658, 1.242722873],
39+
[52.313298048, 2.442885128, 2.521066225],
40+
[2.014329774, 0.280991385, 0.27804144],
41+
[1.107123139, 0.254632539, 0.262339687],
42+
[2.009289742, 0.326275506, 0.350523108],
43+
[12.825387588, 1.255929572, 1.233104088],
44+
[10.877038149, 8.720785768, 8.915458546],
45+
[0.782900126, 0.533128121, 0.520370413],
46+
[2.777874371, 0.59002042, 0.589834364],
47+
[5.93977676, 0.693196166, 0.68480073],
48+
[4.660134019, 3.71664791, 3.652609356],
49+
[13.124461103, 3.541201064, 3.543533407],
50+
[13.102851482, 3.562170113, 3.556527399],
51+
[1.67530715, 1.432144949, 1.446707141],
52+
[0.317277409, 0.125771727, 0.120007768],
53+
[0.271492014, 0.080272309, 0.078311859],
54+
[0.275219599, 0.07759221, 0.080484985],
55+
[0.409570806, 0.204722892, 0.2035123],
56+
[0.300740655, 0.064404142, 0.064988057],
57+
[0.289511607, 0.063690134, 0.070040216],
58+
[0.296861655, 0.079109189, 0.070022097]
59+
]
60+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"system": "DataFusion (Vortex, single)",
3+
"date": "2024-04-17",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v45.0.0 (26058ac) - Vortex 0.28",
7+
"tags": ["Rust","column-oriented","embedded","stateless"],
8+
"load_time": 0,
9+
"data_size": 21179522704,
10+
"result": [
11+
[0.174364204, 0.010440409, 0.010370387],
12+
[0.244388159, 0.042588558, 0.039147909],
13+
[0.467693154, 0.076654725, 0.075430334],
14+
[2.531539754, 0.17338202, 0.174049527],
15+
[2.932393061, 0.856704197, 0.847729667],
16+
[1.308577645, 0.852965281, 0.843271332],
17+
[0.164744389, 0.010295775, 0.010185931],
18+
[0.247191084, 0.047116419, 0.044966464],
19+
[3.759897698, 1.10256083, 1.141110314],
20+
[4.250928774, 1.011239299, 0.996966773],
21+
[2.639029206, 0.183928751, 0.184595799],
22+
[2.687018168, 0.201629756, 0.205098735],
23+
[1.370296469, 0.715426702, 0.734013429],
24+
[4.980540738, 1.13228284, 1.132070276],
25+
[1.607691502, 0.726409284, 0.70328094],
26+
[2.989712592, 1.043465724, 1.031278301],
27+
[5.242456957, 2.113202203, 2.081874208],
28+
[5.180410022, 2.075351767, 2.060650738],
29+
[7.180446451, 3.692168189, 3.672579635],
30+
[1.514124778, 0.088696083, 0.090162821],
31+
[13.791571987, 0.815965445, 0.779773877],
32+
[14.789181517, 0.976388271, 0.955792003],
33+
[15.620386686, 1.284330641, 1.282259827],
34+
[63.750296494, 2.9712611, 3.001768421],
35+
[2.213526768, 0.323827536, 0.322014919],
36+
[1.081892696, 0.54722437, 0.550836332],
37+
[2.207566161, 0.621586004, 0.623680947],
38+
[13.555193069, 1.330019661, 1.336007475],
39+
[10.908955818, 8.35283854, 8.474634992],
40+
[0.766130673, 0.470555176, 0.467622551],
41+
[3.849851625, 0.589227362, 0.584059289],
42+
[6.741495027, 0.655655397, 0.64509176],
43+
[5.117222368, 3.695131107, 3.770641357],
44+
[15.042237585, 3.726314144, 3.781372689],
45+
[15.043102361, 3.779709379, 3.775903681],
46+
[1.681312704, 1.42827274, 1.472978356],
47+
[0.302895873, 0.086830678, 0.088743658],
48+
[0.293882291, 0.063809747, 0.062529115],
49+
[0.299454724, 0.063649213, 0.069480473],
50+
[0.398223368, 0.157227044, 0.146815297],
51+
[0.292553387, 0.047664083, 0.049640484],
52+
[0.290166436, 0.047043317, 0.046314499],
53+
[0.28371809, 0.042731207, 0.045719953]
54+
]
55+
}

datafusion-vortex/run.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
#!/bin/bash
#
# Runs benchmark queries 0..42 from ./queries.sql against Vortex files using
# the `clickbench` utility, timing each query TRIES times.
#
# Usage: ./run.sh [single|partitioned]
#
# Output:
#   stdout       one line per query of the form "[t1, t2, t3]," with times in
#                seconds; a try that produced no result is printed as null.
#   results.csv  one row per try: "<query_num>,<try>,<seconds>" (the seconds
#                field is left empty when the try produced no result).
#
# Requires `clickbench` and `jq` on PATH, and sudo rights to drop the page
# cache before each query.

# Check if an argument is provided
if [[ "$#" -ne 1 ]]; then
  echo "Usage: $0 [single|partitioned]" >&2
  exit 1
fi

# Select the dataset flavor based on the argument
if [[ "$1" == "single" || "$1" == "partitioned" ]]; then
  FLAVOR=$1
  echo "Running benchmark for $FLAVOR"
else
  echo "Invalid argument. Please use 'single' or 'partitioned'." >&2
  exit 1
fi

# Fail fast if a required tool is missing. Without this check every try would
# silently be reported as null, and the previous results.csv would already
# have been wiped.
for tool in clickbench jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required command '$tool' not found in PATH." >&2
    exit 1
  fi
done

# clear results file (creates it if it does not exist)
: > results.csv

TRIES=3
LAST_QUERY=42

for ((query_num = 0; query_num <= LAST_QUERY; query_num++)); do
  # Flush dirty pages and drop the OS page cache once per query, before the
  # first try; the remaining tries for this query run without another drop.
  sync
  echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

  echo -n "["
  for ((i = 1; i <= TRIES; i++)); do
    # Parse query results out of the JSON output, which reports the time in ns
    RES=$(RUST_LOG=off clickbench -i 1 --flavor "$FLAVOR" --formats vortex \
      --display-format gh-json --queries-file ./queries.sql -q "$query_num" \
      --hide-progress-bar --hide-metrics | jq ".value / 1000000000")

    if [[ -n "$RES" ]]; then
      echo -n "$RES"
    else
      echo -n "null"
    fi
    # Separate tries with a comma, but not after the last one
    if ((i != TRIES)); then
      echo -n ", "
    fi
    echo "${query_num},${i},${RES}" >> results.csv
  done
  echo "],"
done

datafusion/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ else
1717
fi
1818

1919
TRIES=3
20-
QUERY_NUM=1
20+
QUERY_NUM=0
2121
echo $1
2222
cat queries.sql | while read -r query; do
2323
sync

0 commit comments

Comments
 (0)