Skip to content

Commit 12303f5

Browse files
committed
Add benchmarks for querying Vortex files with datafusion
1 parent 177204c commit 12303f5

File tree

6 files changed

+571
-1
lines changed

6 files changed

+571
-1
lines changed

datafusion-vortex/benchmark.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
#!/bin/bash
#
# Sets up and runs the DataFusion + Vortex benchmark:
#   1. installs Rust and the build dependencies,
#   2. builds the `clickbench` utility from a pinned Vortex checkout,
#   3. runs `clickbench` once per flavor so the Vortex files exist up front,
#   4. runs ./run.sh for the `single` and `partitioned` flavors.

set -euo pipefail

# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
bash rust-init.sh -y
source ~/.cargo/env

# Install Dependencies
sudo apt-get update
sudo apt-get install --yes gcc jq build-essential

# Install Vortex from a pinned release tag (not the tip of the main branch).
# Skip the clone when the checkout already exists so the script can be re-run
# without `git clone` aborting it under `set -e`.
if [ ! -d vortex ]; then
    git clone https://github.com/spiraldb/vortex.git
fi
cd vortex
git checkout 0.29.0
git submodule update --init
# We build a release version of the benchmarking utility using mimalloc, just like the datafusion-cli
cargo build --release --bin clickbench --package bench-vortex --features mimalloc
# Put the freshly built `clickbench` binary on PATH for the steps below.
export PATH="$(pwd)/target/release:$PATH"
cd ..

# Vortex's benchmarking utility generates appropriate Vortex files by itself, so we just run it to make sure they exist before we start measuring.
# This will download parquet files (with time and string columns already converted to the logically correct datatype) and generate Vortex files from them.
clickbench -i 1 --flavor single --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics
clickbench -i 1 --flavor partitioned --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics

# Run the benchmarks for the single and partitioned flavors; the Vortex files were generated by the clickbench runs above.
./run.sh single
./run.sh partitioned

datafusion-vortex/queries.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
3+
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
4+
SELECT AVG("UserID") FROM hits;
5+
SELECT COUNT(DISTINCT "UserID") FROM hits;
6+
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
7+
SELECT MIN("EventDate"), MAX("EventDate") FROM hits;
8+
SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
9+
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10+
SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
11+
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
12+
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
13+
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
14+
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
15+
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
16+
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19+
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
22+
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
23+
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY "EventTime" LIMIT 10;
25+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
26+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
27+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
28+
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
31+
SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
32+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
33+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
34+
SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
35+
SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
36+
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
38+
SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
39+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT "URLHash", "EventDate", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate" ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', "EventTime") AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-14' AND "EventDate" <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2025-04-17",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v46.0.0 (26058ac) - Vortex 0.28",
7+
"tags": [
8+
"Rust",
9+
"column-oriented",
10+
"embedded",
11+
"stateless"
12+
],
13+
"load_time": 0,
14+
"data_size": 17429875344,
15+
"result": [
16+
[
17+
0.215567151,
18+
0.046278166,
19+
0.043601747
20+
],
21+
[
22+
0.274220936,
23+
0.062314187,
24+
0.066048662
25+
],
26+
[
27+
0.371992732,
28+
0.10539287,
29+
0.108197763
30+
],
31+
[
32+
1.239432059,
33+
0.171779962,
34+
0.169550135
35+
],
36+
[
37+
1.559370457,
38+
0.866417841,
39+
0.88394495
40+
],
41+
[
42+
1.222848924,
43+
0.840264801,
44+
0.851401376
45+
],
46+
[
47+
0.228422624,
48+
0.047651611,
49+
0.046566764
50+
],
51+
[
52+
0.304735748,
53+
0.071698597,
54+
0.065915069
55+
],
56+
[
57+
2.018031574,
58+
1.072580817,
59+
1.078097929
60+
],
61+
[
62+
2.42139936,
63+
1.050517562,
64+
1.069757195
65+
],
66+
[
67+
1.369403369,
68+
0.181763231,
69+
0.194613491
70+
],
71+
[
72+
1.411437838,
73+
0.211869484,
74+
0.203208881
75+
],
76+
[
77+
1.255129659,
78+
0.69719526,
79+
0.710736834
80+
],
81+
[
82+
3.341412586,
83+
1.233998312,
84+
1.160074919
85+
],
86+
[
87+
1.323241155,
88+
0.685787721,
89+
0.678354981
90+
],
91+
[
92+
1.704650147,
93+
1.004650592,
94+
1.022431137
95+
],
96+
[
97+
3.611858953,
98+
2.031219486,
99+
2.101332414
100+
],
101+
[
102+
3.477399784,
103+
1.989789673,
104+
2.044240489
105+
],
106+
[
107+
4.663775794,
108+
3.358592207,
109+
3.319511944
110+
],
111+
[
112+
0.857805683,
113+
0.099160654,
114+
0.107769924
115+
],
116+
[
117+
12.922699049,
118+
0.692098242,
119+
0.703772862
120+
],
121+
[
122+
13.968046449,
123+
0.783625241,
124+
0.768264246
125+
],
126+
[
127+
19.889260919,
128+
1.218880835,
129+
1.192464516
130+
],
131+
[
132+
52.273858091,
133+
2.486884669,
134+
2.502219491
135+
],
136+
[
137+
2.017464136,
138+
0.270297829,
139+
0.274953439
140+
],
141+
[
142+
1.103680073,
143+
0.256918376,
144+
0.267822425
145+
],
146+
[
147+
2.023380428,
148+
0.3483503,
149+
0.347485008
150+
],
151+
[
152+
12.837263353,
153+
1.284630463,
154+
1.293063698
155+
],
156+
[
157+
11.461790032,
158+
8.81546356,
159+
8.988050522
160+
],
161+
[
162+
0.785249741,
163+
0.538045907,
164+
0.53664922
165+
],
166+
[
167+
2.797090584,
168+
0.59572825,
169+
0.59883556
170+
],
171+
[
172+
5.952816687,
173+
0.662956918,
174+
0.684422986
175+
],
176+
[
177+
4.654727265,
178+
3.729290208,
179+
3.712276033
180+
],
181+
[
182+
13.154377707,
183+
3.571075149,
184+
3.56755644
185+
],
186+
[
187+
13.232677744,
188+
3.583967462,
189+
3.558374194
190+
],
191+
[
192+
1.666283211,
193+
1.434008173,
194+
1.438127888
195+
],
196+
[
197+
0.33282251,
198+
0.119020662,
199+
0.116869478
200+
],
201+
[
202+
0.304291802,
203+
0.079318691,
204+
0.077934205
205+
],
206+
[
207+
0.31124958,
208+
0.071654016,
209+
0.080335567
210+
],
211+
[
212+
0.429109382,
213+
0.21039532,
214+
0.217028899
215+
],
216+
[
217+
0.292319557,
218+
0.066677619,
219+
0.064420702
220+
],
221+
[
222+
0.290753118,
223+
0.060678557,
224+
0.06512376
225+
]
226+
]
227+
}

0 commit comments

Comments
 (0)