
Commit 9b6609a

doris
update update
1 parent 2d05dd9 commit 9b6609a

14 files changed: 348 additions & 0 deletions


doris/benchmark.sh

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <DB_NAME> <RESULT_FILE_RUNTIMES> <RESULT_FILE_MEMORY_USAGE>"
    exit 1
fi

# Arguments
DB_NAME="$1"
RESULT_FILE_RUNTIMES="$2"
RESULT_FILE_MEMORY_USAGE="$3"

# Query log file
QUERY_LOG_FILE="query_log.txt"

# Print the database name
echo "Running queries on database: $DB_NAME"

# Run queries and log the output
./run_queries.sh "$DB_NAME" 2>&1 | tee "$QUERY_LOG_FILE"

# Extract the response times from the query log and group them into
# one bracketed triple per query (three runs per query)
RESULT=$(grep -oP 'Response time: \d+\.\d+ s' "$QUERY_LOG_FILE" | sed -r -e 's/Response time: ([0-9]+\.[0-9]+) s/\1/' | \
    awk '{ if (i % 3 == 0) { printf "[" }; printf "%s", $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }')

# Output the result
if [[ -n "$RESULT_FILE_RUNTIMES" ]]; then
    echo "$RESULT" > "$RESULT_FILE_RUNTIMES"
    echo "Result written to $RESULT_FILE_RUNTIMES"
else
    echo "$RESULT"
fi
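
For reference, assuming run_queries.sh prints one "Response time: … s" line per query run and each query is run three times, the grep/sed/awk pipeline above turns a log excerpt like the following (illustrative values) into one bracketed triple per query:

Response time: 1.69 s
Response time: 1.46 s
Response time: 1.47 s

becomes

[1.69,1.46,1.47],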

doris/count.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"

doris/create_and_load.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 7 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DDL_FILE> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"
DDL_FILE="$3"
DATA_DIRECTORY="$4"
NUM_FILES="$5"
SUCCESS_LOG="$6"
ERROR_LOG="$7"

# Validate arguments
[[ ! -f "$DDL_FILE" ]] && { echo "Error: DDL file '$DDL_FILE' does not exist."; exit 1; }
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }

echo "Create database"
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"

echo "Execute DDL"
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" < "$DDL_FILE"

echo "Load data"
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"
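
main.sh drives this script; a direct invocation for the 1m dataset, using the data directory and log names defaulted there, would look like:

./create_and_load.sh bluesky_1m_generic bluesky ddl_generic.sql ~/data/bluesky 1 success.log error.log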

doris/ddl_generic.sql

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
CREATE TABLE bluesky (
    kind VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.kind')) NOT NULL,
    operation VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.operation')) NULL,
    collection VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.commit.collection')) NULL,
    did VARCHAR(100) GENERATED ALWAYS AS (get_json_string(data, '$.did')) NOT NULL,
    time_us BIGINT GENERATED ALWAYS AS (get_json_bigint(data, '$.time_us')) NOT NULL,
    `data` VARIANT NOT NULL
)
DUPLICATE KEY (kind, operation, collection)
DISTRIBUTED BY HASH(collection, did) BUCKETS 32
PROPERTIES (
    "replication_num" = "1"
);

doris/drop_table.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 2 ]]; then
    echo "Usage: $0 <DB_NAME> <TABLE_NAME>"
    exit 1
fi

# Arguments
DB_NAME="$1"
TABLE_NAME="$2"

echo "Dropping table: $DB_NAME.$TABLE_NAME"
mysql -P 9030 -h 127.0.0.1 -u root "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"

doris/install.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
#!/bin/bash

# DORIS_PACKAGE is exported by main.sh before this script runs
wget --timestamping https://apache-doris-releases.oss-accelerate.aliyuncs.com/${DORIS_PACKAGE}.tar.gz
mkdir -p ${DORIS_PACKAGE}
tar -xvf ./${DORIS_PACKAGE}.tar.gz --strip-components 1 -C ./${DORIS_PACKAGE}

echo "storage_page_cache_limit=60%" >> ./${DORIS_PACKAGE}/be/conf/be.conf
echo "enable_java_support=false" >> ./${DORIS_PACKAGE}/be/conf/be.conf

doris/load_data.sh

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 6 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
DB_NAME="$2"
TABLE_NAME="$3"
MAX_FILES="$4"
SUCCESS_LOG="$5"
ERROR_LOG="$6"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Create a temporary directory for uncompressed files
TEMP_DIR=$(mktemp -d /var/tmp/json_files.XXXXXX)
trap "rm -rf $TEMP_DIR" EXIT # Cleanup temp directory on script exit

# Load data
counter=0
start=0
for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
    echo "Processing file: $file"

    # Extract the numeric index from the file name; skip files at or below $start
    num=$(echo "$file" | sed -n 's/[^0-9]*\([0-9]\+\).*/\1/p')
    if [[ -n "$num" && "$num" -le "$start" ]]; then
        continue
    fi

    # Uncompress the file into the TEMP_DIR
    uncompressed_file="$TEMP_DIR/$(basename "${file%.gz}")"
    gunzip -c "$file" > "$uncompressed_file"

    if [[ $? -ne 0 ]]; then
        echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
        continue
    fi

    MAX_ATTEMPT=10
    attempt=0
    while [ $attempt -lt $MAX_ATTEMPT ]
    do
        # Attempt the import via Stream Load; body goes to /tmp/curl_body, HTTP code to stdout
        http_code=$(curl -s -w "%{http_code}" -o /tmp/curl_body --location-trusted -u root: -H "max_filter_ratio: 0.1" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
        response_body="$(cat /tmp/curl_body)"
        response_status="$(jq -r '.Status' /tmp/curl_body)"
        echo "$response_status"
        if [[ "$http_code" -ge 200 && "$http_code" -lt 300 && "$response_status" = "Success" ]]; then
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Successfully imported $file. Response: $response_body" >> "$SUCCESS_LOG"
            rm -f "$uncompressed_file" # Delete the uncompressed file after successful processing
            break
        else
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] Attempt $attempt failed for $file with status code $http_code. Response: $response_body" >> "$ERROR_LOG"
            attempt=$((attempt + 1))
            sleep 2
        fi
    done

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done
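
create_and_load.sh is the normal caller; a manual load of a single gzipped JSON file into the 1m database would be invoked, for example, as:

./load_data.sh ~/data/bluesky bluesky_1m_generic bluesky 1 success.log error.log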

doris/main.sh

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
#!/bin/bash

export DORIS_PACKAGE="apache-doris-3.0.5-bin-x64"

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the dataset size ("choice") as an argument
CHOICE="${1:-$DEFAULT_CHOICE}"

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${3:-success.log}"
ERROR_LOG="${4:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Check if the data directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
    echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
    exit 1
fi

if [ "$CHOICE" = "ask" ]; then
    echo "Select the dataset size to benchmark:"
    echo "1) 1m (default)"
    echo "2) 10m"
    echo "3) 100m"
    echo "4) 1000m"
    echo "5) all"
    read -p "Enter the number corresponding to your choice: " CHOICE
fi

./install.sh
./start.sh

benchmark() {
    local size=$1
    local suffix=$2
    # Check that DATA_DIRECTORY contains the required number of files to run the benchmark
    file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
    if (( file_count < size )); then
        echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
        exit 1
    fi
    ./create_and_load.sh "bluesky_${size}m_${suffix}" bluesky "ddl_${suffix}.sql" "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
    ./total_size.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.total_size"
    ./count.sh "bluesky_${size}m_${suffix}" bluesky | tee "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.count"
    ./benchmark.sh "bluesky_${size}m_${suffix}" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_runtime" "${OUTPUT_PREFIX}_bluesky_${size}m_${suffix}.results_memory_usage"
    ./drop_table.sh "bluesky_${size}m_${suffix}" bluesky
}

case $CHOICE in
    2)
        benchmark 10 generic
        ;;
    3)
        benchmark 100 generic
        ;;
    4)
        benchmark 1000 generic
        ;;
    5)
        benchmark 1 generic
        benchmark 10 generic
        benchmark 100 generic
        benchmark 1000 generic
        ;;
    *)
        benchmark 1 generic
        ;;
esac

./uninstall.sh
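
A non-interactive run, for example the 100m benchmark with an explicit data directory, logs, and output prefix (all arguments are optional and default as shown above), would be:

./main.sh 3 ~/data/bluesky success.log error.log _m6i.8xlarge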

doris/queries.sql

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
SELECT collection AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC;
SELECT collection AS event, COUNT(*) AS count, COUNT(DISTINCT did) AS users FROM bluesky WHERE kind = 'commit' AND operation = 'create' GROUP BY event ORDER BY count DESC;
SELECT collection AS event, HOUR(from_microsecond(time_us)) AS hour_of_day, COUNT(*) AS count FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection IN ('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like') GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
SELECT did AS user_id, MIN(from_microsecond(time_us)) AS first_post_ts FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY first_post_ts ASC LIMIT 3;
SELECT did AS user_id, MILLISECONDS_DIFF(MAX(from_microsecond(time_us)), MIN(from_microsecond(time_us))) AS activity_span FROM bluesky WHERE kind = 'commit' AND operation = 'create' AND collection = 'app.bsky.feed.post' GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
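
Any of these five queries can also be run by hand over the same MySQL protocol port the helper scripts use, against a loaded database (database name assumed here for illustration):

mysql -P 9030 -h 127.0.0.1 -u root bluesky_1m_generic -e "SELECT collection AS event, COUNT(*) AS count FROM bluesky GROUP BY event ORDER BY count DESC;"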
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
{
    "system": "Doris",
    "version": "3.0.5",
    "os": "Ubuntu 24.04",
    "date": "2025-04-30",
    "machine": "m6i.8xlarge, 16000GiB gp3",
    "retains_structure": "yes",
    "tags": [
    ],
    "dataset_size": 1000000000,
    "num_loaded_documents": 999999245,
    "total_size": 215190746431,
    "result": [
        [1.69,1.46,1.47],
        [90.34,4.07,4.08],
        [15.45,4.28,4.27],
        [0.84,0.82,0.83],
        [0.86,0.89,0.88]
    ]
}
