Skip to content

Commit c0c07a8

Browse files
committed
init
1 parent 177204c commit c0c07a8

File tree

5 files changed

+238
-0
lines changed

5 files changed

+238
-0
lines changed

datafusion-vortex/benchmark.sh

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
#
# Sets up a fresh machine and runs the benchmark end to end:
#   1. installs Rust (rustup) and the system packages the build needs,
#   2. builds the `clickbench` binary from the Vortex repository at tag 0.28.0,
#   3. runs `clickbench` once per flavor so the Vortex files exist up front,
#   4. runs ./run.sh for the "single" and "partitioned" flavors.
#
# Must be started from the directory that contains run.sh, because run.sh is
# invoked through a relative path.

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Install Rust
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh \
  || die "failed to download the rustup installer"
bash rust-init.sh -y
source ~/.cargo/env

# Install Dependencies
sudo yum update -y
sudo yum install gcc jq -y

# Build Vortex, pinned to the 0.28.0 tag.
# Skip the clone when a checkout is already present so the script can be re-run.
if [[ ! -d vortex ]]; then
  git clone https://github.com/spiraldb/vortex.git || die "git clone failed"
fi
# Without this check a failed clone would let the checkout/build below run in
# the wrong directory, and the later `cd ..` would leave the benchmark dir.
cd vortex || die "cannot enter the vortex checkout"
git checkout 0.28.0 || die "cannot check out tag 0.28.0"
CARGO_PROFILE_RELEASE_LTO=true RUSTFLAGS="-C codegen-units=1" \
  cargo build --release --bin clickbench -p bench-vortex \
  || die "building clickbench failed"
# Make the freshly built binary visible to this script and to run.sh.
export PATH="$PWD/target/release:$PATH"
cd ..

# Vortex's benchmarking utility generates the Vortex files it needs by itself,
# so run it once per flavor to make sure they exist before we start measuring.
RUST_LOG=off clickbench -i 1 --flavor single --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics > /dev/null
RUST_LOG=off clickbench -i 1 --flavor partitioned --formats vortex --display-format gh-json -q 0 --hide-progress-bar --hide-metrics > /dev/null

# Run the benchmarks for the single-file and the partitioned flavor.
./run.sh single
./run.sh partitioned
30+

datafusion-vortex/queries.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE "AdvEngineID" <> 0;
3+
SELECT SUM("AdvEngineID"), COUNT(*), AVG("ResolutionWidth") FROM hits;
4+
SELECT AVG("UserID") FROM hits;
5+
SELECT COUNT(DISTINCT "UserID") FROM hits;
6+
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
7+
SELECT MIN("EventDate"::INT::DATE), MAX("EventDate"::INT::DATE) FROM hits;
8+
SELECT "AdvEngineID", COUNT(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY COUNT(*) DESC;
9+
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10+
SELECT "RegionID", SUM("AdvEngineID"), COUNT(*) AS c, AVG("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
11+
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
12+
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
13+
SELECT "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
14+
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
15+
SELECT "SearchEngineID", "SearchPhrase", COUNT(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
16+
SELECT "UserID", COUNT(*) FROM hits GROUP BY "UserID" ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
19+
SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT "UserID" FROM hits WHERE "UserID" = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE "URL" LIKE '%google%';
22+
SELECT "SearchPhrase", MIN("URL"), COUNT(*) AS c FROM hits WHERE "URL" LIKE '%google%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
23+
SELECT "SearchPhrase", MIN("URL"), MIN("Title"), COUNT(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Google%' AND "URL" NOT LIKE '%.google.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE "URL" LIKE '%google%' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10;
25+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime") LIMIT 10;
26+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
27+
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY to_timestamp_seconds("EventTime"), "SearchPhrase" LIMIT 10;
28+
SELECT "CounterID", AVG(length("URL")) AS l, COUNT(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE("Referer", '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length("Referer")) AS l, COUNT(*) AS c, MIN("Referer") FROM hits WHERE "Referer" <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM("ResolutionWidth"), SUM("ResolutionWidth" + 1), SUM("ResolutionWidth" + 2), SUM("ResolutionWidth" + 3), SUM("ResolutionWidth" + 4), SUM("ResolutionWidth" + 5), SUM("ResolutionWidth" + 6), SUM("ResolutionWidth" + 7), SUM("ResolutionWidth" + 8), SUM("ResolutionWidth" + 9), SUM("ResolutionWidth" + 10), SUM("ResolutionWidth" + 11), SUM("ResolutionWidth" + 12), SUM("ResolutionWidth" + 13), SUM("ResolutionWidth" + 14), SUM("ResolutionWidth" + 15), SUM("ResolutionWidth" + 16), SUM("ResolutionWidth" + 17), SUM("ResolutionWidth" + 18), SUM("ResolutionWidth" + 19), SUM("ResolutionWidth" + 20), SUM("ResolutionWidth" + 21), SUM("ResolutionWidth" + 22), SUM("ResolutionWidth" + 23), SUM("ResolutionWidth" + 24), SUM("ResolutionWidth" + 25), SUM("ResolutionWidth" + 26), SUM("ResolutionWidth" + 27), SUM("ResolutionWidth" + 28), SUM("ResolutionWidth" + 29), SUM("ResolutionWidth" + 30), SUM("ResolutionWidth" + 31), SUM("ResolutionWidth" + 32), SUM("ResolutionWidth" + 33), SUM("ResolutionWidth" + 34), SUM("ResolutionWidth" + 35), SUM("ResolutionWidth" + 36), SUM("ResolutionWidth" + 37), SUM("ResolutionWidth" + 38), SUM("ResolutionWidth" + 39), SUM("ResolutionWidth" + 40), SUM("ResolutionWidth" + 41), SUM("ResolutionWidth" + 42), SUM("ResolutionWidth" + 43), SUM("ResolutionWidth" + 44), SUM("ResolutionWidth" + 45), SUM("ResolutionWidth" + 46), SUM("ResolutionWidth" + 47), SUM("ResolutionWidth" + 48), SUM("ResolutionWidth" + 49), SUM("ResolutionWidth" + 50), SUM("ResolutionWidth" + 51), SUM("ResolutionWidth" + 52), SUM("ResolutionWidth" + 53), SUM("ResolutionWidth" + 54), SUM("ResolutionWidth" + 55), SUM("ResolutionWidth" + 56), SUM("ResolutionWidth" + 57), SUM("ResolutionWidth" + 58), SUM("ResolutionWidth" + 59), SUM("ResolutionWidth" + 60), SUM("ResolutionWidth" + 61), SUM("ResolutionWidth" + 62), SUM("ResolutionWidth" + 63), SUM("ResolutionWidth" + 64), SUM("ResolutionWidth" + 65), SUM("ResolutionWidth" + 66), SUM("ResolutionWidth" + 67), SUM("ResolutionWidth" + 68), 
SUM("ResolutionWidth" + 69), SUM("ResolutionWidth" + 70), SUM("ResolutionWidth" + 71), SUM("ResolutionWidth" + 72), SUM("ResolutionWidth" + 73), SUM("ResolutionWidth" + 74), SUM("ResolutionWidth" + 75), SUM("ResolutionWidth" + 76), SUM("ResolutionWidth" + 77), SUM("ResolutionWidth" + 78), SUM("ResolutionWidth" + 79), SUM("ResolutionWidth" + 80), SUM("ResolutionWidth" + 81), SUM("ResolutionWidth" + 82), SUM("ResolutionWidth" + 83), SUM("ResolutionWidth" + 84), SUM("ResolutionWidth" + 85), SUM("ResolutionWidth" + 86), SUM("ResolutionWidth" + 87), SUM("ResolutionWidth" + 88), SUM("ResolutionWidth" + 89) FROM hits;
31+
SELECT "SearchEngineID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
32+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
33+
SELECT "WatchID", "ClientIP", COUNT(*) AS c, SUM("IsRefresh"), AVG("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
34+
SELECT "URL", COUNT(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
35+
SELECT 1, "URL", COUNT(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
36+
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, COUNT(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10;
38+
SELECT "Title", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "DontCountHits" = 0 AND "IsRefresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY PageViews DESC LIMIT 10;
39+
SELECT "URL", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT "URLHash", "EventDate"::INT::DATE, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 3594120000172545465 GROUP BY "URLHash", "EventDate"::INT::DATE ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT "WindowClientWidth", "WindowClientHeight", COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-01' AND "EventDate"::INT::DATE <= '2013-07-31' AND "IsRefresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 2868770270353813622 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000;
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{
2+
"system": "DataFusion (Vortex, partitioned)",
3+
"date": "2024-04-15",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v46.0.0 (26058ac) - Vortex 0.28",
7+
"tags": ["Rust", "column-oriented", "embedded", "stateless"],
8+
"load_time": 0,
9+
"data_size": 14779976446,
10+
11+
"result": [
12+
[0.059, 0.020, 0.021],
13+
[0.103, 0.034, 0.032],
14+
[0.188, 0.081, 0.077],
15+
[0.387, 0.089, 0.081],
16+
[0.976, 0.790, 0.792],
17+
[0.982, 0.793, 0.801],
18+
[0.093, 0.031, 0.031],
19+
[0.118, 0.037, 0.038],
20+
[0.997, 0.869, 0.861],
21+
[1.313, 0.987, 0.985],
22+
[0.530, 0.241, 0.250],
23+
[0.607, 0.273, 0.273],
24+
[1.047, 0.849, 0.869],
25+
[2.534, 1.343, 1.203],
26+
[1.095, 0.816, 0.792],
27+
[1.038, 0.952, 0.942],
28+
[2.586, 1.690, 1.710],
29+
[2.500, 1.585, 1.585],
30+
[5.162, 3.475, 3.434],
31+
[0.288, 0.074, 0.073],
32+
[9.896, 1.061, 1.038],
33+
[11.254, 1.246, 1.283],
34+
[21.845, 2.537, 2.541],
35+
[55.438, 9.532, 9.583],
36+
[2.700, 0.436, 0.451],
37+
[0.811, 0.367, 0.355],
38+
[2.702, 0.519, 0.501],
39+
[9.636, 1.464, 1.437],
40+
[9.892, 9.399, 9.500],
41+
[0.521, 0.435, 0.436],
42+
[2.390, 0.751, 0.751],
43+
[5.944, 0.891, 0.890],
44+
[4.703, 3.474, 3.364],
45+
[10.206, 3.615, 3.636],
46+
[10.171, 3.663, 3.631],
47+
[1.289, 1.150, 1.158],
48+
[0.394, 0.188, 0.185],
49+
[0.215, 0.081, 0.080],
50+
[0.289, 0.109, 0.115],
51+
[0.633, 0.346, 0.344],
52+
[0.170, 0.044, 0.045],
53+
[0.159, 0.038, 0.042],
54+
[0.158, 0.052, 0.051]
55+
]
56+
}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
{
2+
"system": "DataFusion (Vortex, single)",
3+
"date": "2024-04-15",
4+
"machine": "c6a.4xlarge, 500gb gp2",
5+
"cluster_size": 1,
6+
"comment": "v45.0.0 (26058ac) - Vortex 0.28",
7+
8+
"tags": ["Rust", "column-oriented", "embedded", "stateless"],
9+
10+
"load_time": 0,
11+
"data_size": 14779976446,
12+
13+
"result": [
14+
[0.091, 0.059, 0.053],
15+
[0.145, 0.074, 0.067],
16+
[0.204, 0.115, 0.109],
17+
[0.353, 0.112, 0.113],
18+
[0.951, 0.851, 0.838],
19+
[1.033, 0.899, 0.901],
20+
[0.120, 0.066, 0.068],
21+
[0.141, 0.083, 0.077],
22+
[1.020, 0.901, 0.901],
23+
[1.288, 1.018, 1.046],
24+
[0.461, 0.266, 0.277],
25+
[0.545, 0.301, 0.308],
26+
[1.097, 0.931, 0.938],
27+
[2.596, 1.315, 1.314],
28+
[1.048, 0.891, 0.877],
29+
[1.090, 0.973, 0.974],
30+
[2.591, 1.797, 1.818],
31+
[2.503, 1.661, 1.668],
32+
[5.074, 3.473, 3.469],
33+
[0.270, 0.107, 0.110],
34+
[9.744, 1.121, 1.111],
35+
[11.237, 1.445, 1.423],
36+
[22.074, 3.566, 3.478],
37+
[55.972, 9.819, 9.872],
38+
[2.579, 0.567, 0.560],
39+
[0.814, 0.492, 0.493],
40+
[2.576, 0.659, 0.632],
41+
[9.598, 1.572, 1.552],
42+
[10.668, 10.012, 10.155],
43+
[0.562, 0.473, 0.465],
44+
[2.281, 0.860, 0.873],
45+
[5.695, 0.991, 0.946],
46+
[4.506, 3.428, 3.497],
47+
[10.139, 3.779, 3.859],
48+
[10.091, 3.804, 3.776],
49+
[1.315, 1.191, 1.235],
50+
[0.432, 0.234, 0.238],
51+
[0.275, 0.156, 0.156],
52+
[0.342, 0.155, 0.157],
53+
[0.667, 0.423, 0.405],
54+
[0.202, 0.080, 0.078],
55+
[0.186, 0.075, 0.076],
56+
[0.187, 0.099, 0.084]
57+
]
58+
}

datafusion-vortex/run.sh

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/bin/bash
#
# Times every query listed in queries.sql with the `clickbench` binary.
#
# Usage: ./run.sh [single|partitioned]
#
# Arguments:
#   $1 - flavor passed to `clickbench --flavor`: "single" or "partitioned"
#
# Outputs:
#   stdout       the flavor name, then one "[t1, t2, t3]," row per query
#                ("null" stands in for a run that produced no value)
#   results.csv  truncated on start; one "query_num,try,time" line per run
#   /tmp/query.sql  text of the query currently being timed
#
# Expects `clickbench` and `jq` on PATH and queries.sql in the current
# directory. Queries are addressed by zero-based line index (-q), so
# queries.sql must contain exactly one query per line.

# Check if an argument is provided
if [[ "$#" -ne 1 ]]; then
  echo "Usage: $0 [single|partitioned]" >&2
  exit 1
fi

# Validate the requested flavor
case "$1" in
  single | partitioned)
    FLAVOR="$1"
    ;;
  *)
    echo "Invalid argument. Please use 'single' or 'partitioned'." >&2
    exit 1
    ;;
esac

# Start from an empty results file (creates it if missing)
: > results.csv

readonly TRIES=3
QUERY_NUM=0
printf '%s\n' "$FLAVOR"
OS=$(uname)

# `|| [[ -n "$query" ]]` keeps the last query even when queries.sql has no
# trailing newline.
while IFS= read -r query || [[ -n "$query" ]]; do
  # Flush dirty pages and drop the page cache once per query, so only the
  # first of the TRIES runs starts cold. /proc is not available on macOS.
  sync
  if [[ "$OS" != "Darwin" ]]; then
    echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
  fi

  # Nothing in this script reads the file back; clickbench selects the query
  # by index via -q.
  printf '%s\n' "$query" > /tmp/query.sql

  echo -n "["
  for ((i = 1; i <= TRIES; i++)); do
    # Parse the timing out of the JSON report, which gives the time in ns.
    # stdin is detached so clickbench can never consume the query list that
    # feeds the enclosing loop.
    # NOTE(review): ns / 1000000 yields milliseconds - confirm that this is the
    # unit expected by whatever consumes this output.
    RES=$(RUST_LOG=off clickbench -i 1 --flavor "$FLAVOR" --formats vortex --display-format gh-json -q "$QUERY_NUM" --hide-progress-bar --hide-metrics < /dev/null | jq ".value / 1000000")

    if [[ -n "$RES" ]]; then
      echo -n "$RES"
    else
      echo -n "null"
    fi
    if ((i != TRIES)); then
      echo -n ", "
    fi
    echo "${QUERY_NUM},${i},${RES}" >> results.csv
  done
  echo "],"

  QUERY_NUM=$((QUERY_NUM + 1))
done < queries.sql

0 commit comments

Comments
 (0)