Skip to content

Commit 92302e2

Browse files
authored
Merge pull request #73 from BiteTheDDDDt/main
Add Apache Doris
2 parents e2f6952 + 248810c commit 92302e2

28 files changed

+522
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ While the main benchmark uses a specific machine configuration for reproducibili
138138
- [x] SingleStore
139139
- [x] GreptimeDB
140140
- [x] FerretDB
141+
- [x] Apache Doris
141142
- [ ] Quickwit
142143
- [ ] Meilisearch
143144
- [ ] Sneller

doris/benchmark.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash

# If you change something in this file, please change also in starrocks/benchmark.sh.
#
# Runs the benchmark queries against a Doris database and writes the measured
# runtimes (grouped three per query) to RESULT_FILE_RUNTIMES, or to stdout if
# that argument is empty.

# Check if the required arguments are provided
if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <DB_NAME> <RESULT_FILE_RUNTIMES> <QUERIES_FILE>"
    exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE_RUNTIMES="$2"
QUERIES_FILE="$3"

# Log file capturing the raw output of run_queries.sh
QUERY_LOG_FILE="query_log.txt"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output.
# Fix: use $QUERY_LOG_FILE consistently instead of repeating the literal name.
./run_queries.sh "$DB_NAME" "$QUERIES_FILE" 2>&1 | tee "$QUERY_LOG_FILE"

# Process the query log and prepare the result.
# Extract every "Response time: X.Y s" value and group them three at a time
# into "[a,b,c]," rows (presumably each query is executed three times —
# confirm against run_queries.sh).
# Fix: grep reads the file directly instead of a useless `cat |` pipe.
RESULT=$(grep -oP 'Response time: \d+\.\d+ s' "$QUERY_LOG_FILE" \
    | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' \
    | awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the result
if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then
    echo "$RESULT" > "$RESULT_FILE_RUNTIMES"
    echo "Result written to $RESULT_FILE_RUNTIMES"
else
    echo "$RESULT"
fi

doris/count.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash

# If you change something in this file, please change also in starrocks/count.sh.
#
# Prints the row count of TABLE_NAME in DB_NAME via the Doris MySQL protocol
# (FE on 127.0.0.1:9030, user root, no password).

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

# Fix: quote "$DB_NAME" so an empty or unusual value cannot word-split.
# NOTE(review): $TABLE_NAME is interpolated into SQL — callers pass trusted,
# hard-coded table names (see main.sh); do not call this with external input.
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"

doris/create_and_load.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash

# Creates the benchmark database and table, loads the data files, then waits
# so the backend can settle before the data size is collected.

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DATA_DIRECTORY="$3"
NUM_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }


echo "Create database"
# Fix: quote "$DB_NAME" (unquoted expansion would word-split/glob).
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"

echo "Execute DDL"
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" < "ddl.sql"

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

echo "Sleep 120 sec to collect data size"
# NOTE(review): "120s" suffix is GNU sleep; portable form would be `sleep 120`.
sleep 120s

doris/ddl.sql

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- Benchmark table: each bluesky event is stored as one semi-structured
-- VARIANT value in `data`; `id` is a synthetic auto-increment key used for
-- hash distribution.
CREATE TABLE bluesky (
`id` BIGINT NOT NULL AUTO_INCREMENT,
`data` variant NOT NULL
)
-- Hash-distribute on the synthetic key into 32 buckets (tablets).
DISTRIBUTED BY HASH(id) BUCKETS 32
PROPERTIES (
-- Single replica: this benchmark runs a single-node Doris deployment.
"replication_num"="1"
);

doris/drop_table.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash

# If you change something in this file, please change also in starrocks/drop_table.sh.
#
# Drops TABLE_NAME from DB_NAME (no-op if the table does not exist).

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

DB_NAME="$1"
TABLE_NAME="$2"

echo "Dropping table: $DB_NAME.$TABLE_NAME"
# Fix: quote "$DB_NAME" (unquoted expansion would word-split/glob).
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"

doris/install.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash

# Downloads and unpacks the Apache Doris release tarball and installs the
# client-side dependencies (mysql client and JRE 17).
#
# Fix: fail fast if DORIS_FULL_NAME is not exported by the caller (main.sh);
# previously an unset value silently downloaded ".tar.gz" and ran
# `mkdir` with an empty name.
: "${DORIS_FULL_NAME:?DORIS_FULL_NAME must be set (e.g. apache-doris-3.0.5-bin-x64)}"

wget "https://apache-doris-releases.oss-accelerate.aliyuncs.com/${DORIS_FULL_NAME}.tar.gz"
# Fix: -p keeps re-runs from failing when the directory already exists.
mkdir -p "${DORIS_FULL_NAME}"
tar -xvf "${DORIS_FULL_NAME}.tar.gz" --strip-components 1 -C "${DORIS_FULL_NAME}"

sudo apt-get update
sudo apt-get install -y mysql-client openjdk-17-jre-headless # somehow _EXACTLY_ v17 is needed

doris/load_data.sh

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/bin/bash

# Uncompresses up to MAX_FILES bluesky *.json.gz files and imports each into
# Doris through the HTTP Stream Load API, retrying failed imports up to
# MAX_ATTEMPT times. Successes and failures are appended to SUCCESS_LOG /
# ERROR_LOG respectively.

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Create a temporary directory for uncompressed files.
# Fix: single-quote the trap so $TEMP_DIR is expanded at trap time, and keep
# the curl response body inside this private directory instead of the
# predictable, world-writable /tmp/curl_body.
TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
trap 'rm -rf "$TEMP_DIR"' EXIT # Cleanup temp directory on script exit
CURL_BODY="$TEMP_DIR/curl_body"

# Load data.
# Fix: iterate the glob directly instead of parsing `ls` output (both expand
# in lexical order, so the processing order is unchanged); $counter already
# enforces the MAX_FILES limit below.
counter=0
start=0
for file in "$DATA_DIRECTORY"/*.json.gz; do
    [[ -e "$file" ]] || break # glob matched nothing
    echo "Processing file: $file"
    # Extract the first number embedded in the path; files numbered <= $start
    # are skipped (resume support; $start is currently always 0).
    # NOTE(review): this matches the first digits anywhere in the full path,
    # so a digit in DATA_DIRECTORY itself would be picked up — confirm.
    num=$(echo "$file" | sed -n 's/[^0-9]*\([0-9]\+\).*/\1/p')
    # Fix: guard against an empty $num, which made `[ -le ]` error out.
    if [[ -n "$num" && "$num" -le "$start" ]]; then
        continue
    fi

    # Uncompress the file into the TEMP_DIR
    uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
    if ! gunzip -c "$file" > "$uncompressed_file"; then
        echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
        continue
    fi

    MAX_ATTEMPT=10
    attempt=0
    while [[ "$attempt" -lt "$MAX_ATTEMPT" ]]; do
        # Attempt the import via Stream Load; capture the HTTP status code and
        # the JSON response body.
        # Fix: write the body with a plain -o instead of the racy
        # `-o >(cat > file)` process substitution, which could leave the file
        # incomplete when it was read immediately afterwards.
        http_code=$(curl -s -w "%{http_code}" -o "$CURL_BODY" --location-trusted -u root: \
            -H "max_filter_ratio: 0.1" -H "Expect:100-continue" -H "columns: data" \
            -T "$uncompressed_file" -XPUT "http://127.0.0.1:8030/api/$DB_NAME/$TABLE_NAME/_stream_load")
        response_body=$(cat "$CURL_BODY")
        response_status=$(jq -r '.Status' "$CURL_BODY")
        echo "$response_status"
        if [[ "$http_code" -ge 200 && "$http_code" -lt 300 && "$response_status" = "Success" ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG"
            rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
            break
        else
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $attempt attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG"
            attempt=$((attempt + 1))
            sleep 2
        fi
    done

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done

doris/main.sh

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/bin/bash

# If you change something in this file, please change also in starrocks/main.sh.
#
# Entry point: installs and starts Doris, runs the benchmark for the selected
# dataset size(s), then stops and uninstalls Doris.

export DORIS_FULL_NAME="apache-doris-3.0.5-bin-x64"

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the scale factor ("choice") as an argument
CHOICE="${1:-$DEFAULT_CHOICE}"

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${3:-success.log}"
ERROR_LOG="${4:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
    echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
    exit 1
fi

if [[ "$CHOICE" = "ask" ]]; then
    echo "Select the dataset size to benchmark:"
    echo "1) 1m (default)"
    echo "2) 10m"
    echo "3) 100m"
    echo "4) 1000m"
    echo "5) all"
    # Fix: -r keeps read from interpreting backslashes in the input.
    read -r -p "Enter the number corresponding to your choice: " CHOICE
fi

./install.sh
./start.sh

# Run one full benchmark round for the given dataset size (in millions of
# rows): create+load, record total size and row count, run the queries, then
# drop the table.
benchmark() {
    local size=$1
    local file_count
    # Check DATA_DIRECTORY contains the required number of files to run the benchmark
    file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
    if (( file_count < size )); then
        echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
        exit 1
    fi
    ./create_and_load.sh "bluesky_${size}m" bluesky "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
    ./total_size.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size"
    ./count.sh "bluesky_${size}m" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
    # NOTE(review): this passes "queries.sql" but the committed file is
    # queries_default.sql — confirm queries.sql is generated/copied elsewhere.
    ./benchmark.sh "bluesky_${size}m" "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime" "queries.sql"
    ./drop_table.sh "bluesky_${size}m" bluesky
}

case "$CHOICE" in
    2)
        benchmark 10
        ;;
    3)
        benchmark 100
        ;;
    4)
        benchmark 1000
        ;;
    5)
        benchmark 1
        benchmark 10
        benchmark 100
        benchmark 1000
        ;;
    *)
        benchmark 1
        ;;
esac

./stop.sh
./uninstall.sh

doris/queries_default.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC;
2+
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, COUNT(*) AS count, COUNT(DISTINCT cast(data['did'] AS TEXT )) AS users FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' GROUP BY event ORDER BY count DESC;
3+
SELECT cast(data['commit']['collection'] AS TEXT ) AS event, HOUR(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4+
SELECT cast(data['did'] AS TEXT ) AS user_id, MIN(from_microsecond(CAST(data['time_us'] AS BIGINT))) AS first_post_ts FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
5+
SELECT cast(data['did'] AS TEXT ) AS user_id, MILLISECONDS_DIFF(MAX(from_microsecond(CAST(data['time_us'] AS BIGINT))),MIN(from_microsecond(CAST(data['time_us'] AS BIGINT)))) AS activity_span FROM bluesky WHERE cast(data['kind'] AS TEXT ) = 'commit' AND cast(data['commit']['operation'] AS TEXT ) = 'create' AND cast(data['commit']['collection'] AS TEXT ) = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

0 commit comments

Comments
 (0)