Skip to content

Commit 9098bd3

Browse files
authored
Merge pull request #85 from murphyatwork/murphy_sr_4.0.0
update starrocks result to 4.0.0-rc01
2 parents a1389f4 + a0e6e67 commit 9098bd3

16 files changed, with 114 additions (+114) and 72 deletions (-72).

starrocks/count.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,4 +12,4 @@ fi
1212
DB_NAME="$1"
1313
TABLE_NAME="$2"
1414

15-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;"
15+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"

starrocks/create_and_load.sh

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,17 +15,18 @@ DATA_DIRECTORY="$3"
1515
NUM_FILES="$4"
1616
SUCCESS_LOG="$5"
1717
ERROR_LOG="$6"
18+
DDL_FILE="ddl.sql"
1819

1920
# Validate arguments
2021
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
2122
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }
2223

2324

2425
echo "Create database"
25-
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
26+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
2627

2728
echo "Execute DDL"
28-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql"
29+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE"
2930

3031
echo "Load data"
3132
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

starrocks/ddl.sql

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,12 @@
11
CREATE TABLE bluesky (
22
`id` BIGINT AUTO_INCREMENT,
3-
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
4-
);
3+
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON",
4+
5+
sort_key VARBINARY AS encode_sort_key(
6+
get_json_string(data, 'kind'),
7+
get_json_string(data, 'commit.operation'),
8+
get_json_string(data, 'commit.collection'),
9+
get_json_string(data, 'did')
10+
)
11+
)
12+
ORDER BY (sort_key);

starrocks/drop_table.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,4 +12,4 @@ DB_NAME="$1"
1212
TABLE_NAME="$2"
1313

1414
echo "Dropping table: $DB_NAME.$TABLE_NAME"
15-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME"
15+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"

starrocks/install.sh

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,4 +4,21 @@ sudo snap install docker
44
sudo apt-get update
55
sudo apt-get install -y mysql-client
66

7-
docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu
7+
docker run -p 9030:9030 -p 8030:8030 -p 8040:8040 -itd --name starrocks starrocks/allin1-ubuntu:4.0.0-rc01
8+
9+
echo "Starting StarRocks container..."
10+
sleep 5
11+
12+
# Monitor logs until "Enjoy" appears
13+
echo "Monitoring container logs for 'Enjoy' message..."
14+
timeout 300 docker logs -f starrocks | while read line; do
15+
echo "$line"
16+
if echo "$line" | grep -q "Enjoy"; then
17+
echo "Found 'Enjoy' message! Container is ready."
18+
# Kill the docker logs process
19+
pkill -f "docker logs -f starrocks"
20+
break
21+
fi
22+
done
23+
24+
echo "StarRocks started successfully."

starrocks/load_data.sh

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -36,14 +36,25 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
3636
echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
3737
continue
3838
fi
39-
MAX_ATTEMPT=10
39+
40+
MAX_ATTEMPT=1
4041
attempt=0
4142
while [ $attempt -lt $MAX_ATTEMPT ]
4243
do
43-
# Attempt the import
44-
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
45-
response_body="$(cat /tmp/curl_body)"
46-
response_status="$(cat /tmp/curl_body | jq -r '.Status')"
44+
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body_$$) \
45+
--location-trusted -u root: \
46+
-H "max_filter_ratio: 0.00001" \
47+
-H "strict_mode: true" \
48+
-H "Expect:100-continue" \
49+
-T "$uncompressed_file" \
50+
-XPUT http://${DB_HOST}:${DB_HTTP_PORT}/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
51+
response_body="$(cat /tmp/curl_body_$$)"
52+
if jq -e . >/dev/null 2>&1 < /tmp/curl_body_$$; then
53+
response_status="$(jq -r '.Status' < /tmp/curl_body_$$)"
54+
else
55+
response_status=""
56+
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Invalid JSON response for $file: $(cat /tmp/curl_body_$$)" >> "$ERROR_LOG"
57+
fi
4758
echo $response_status
4859
if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then
4960
if [ "$response_status" = "Success" ]

starrocks/main.sh

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,11 @@ ERROR_LOG="${4:-error.log}"
1818
# Define prefix for output files
1919
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"
2020

21+
export DB_HOST="127.0.0.1"
22+
export DB_USER="root"
23+
export DB_MYSQL_PORT="9030"
24+
export DB_HTTP_PORT="8030" # HTTP endpoint for stream load
25+
2126
# Check if the directory exists
2227
if [[ ! -d "$DATA_DIRECTORY" ]]; then
2328
echo "Error: Data directory '$DATA_DIRECTORY' does not exist."

starrocks/physical_query_plans.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ cat queries.sql | while read -r query; do
1717
echo "------------------------------------------------------------------------------------------------------------------------"
1818
echo "Physical query plan for query Q$QUERY_NUM:"
1919
echo
20-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query"
20+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query"
2121

2222
# Increment the query number
2323
QUERY_NUM=$((QUERY_NUM + 1))

starrocks/queries.sql

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
22
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
3-
SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4-
SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5-
SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
3+
SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4+
SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5+
SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

starrocks/queries_formatted.sql

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,9 @@
33
------------------------------------------------------------------------------------------------------------------------
44

55
SELECT get_json_string(data, 'commit.collection') AS event,
6-
count() AS count
7-
FROM bluesky
8-
GROUP BY event
6+
count() AS count
7+
FROM bluesky
8+
GROUP BY event
99
ORDER BY count DESC;
1010

1111
------------------------------------------------------------------------------------------------------------------------
@@ -17,33 +17,33 @@ SELECT
1717
count(DISTINCT get_json_string(data, 'did')) AS users
1818
FROM bluesky
1919
WHERE (get_json_string(data, 'kind') = 'commit')
20-
AND (get_json_string(data, 'commit.operation') = 'create')
20+
AND (get_json_string(data, 'commit.operation') = 'create')
2121
GROUP BY event
2222
ORDER BY count DESC;
2323

2424
------------------------------------------------------------------------------------------------------------------------
2525
-- Q3 - When do people use BlueSky
2626
------------------------------------------------------------------------------------------------------------------------
2727
SELECT
28-
get_json_string(data, 'commit.collection') AS event,
29-
hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
28+
get_json_string(data, 'commit.collection') AS event,
29+
hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day,
3030
count() AS count
3131
FROM bluesky
32-
WHERE (get_json_string(data, 'kind') = 'commit')
33-
AND (get_json_string(data, 'commit.operation') = 'create')
34-
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
32+
WHERE (get_json_string(data, 'kind') = 'commit')
33+
AND (get_json_string(data, 'commit.operation') = 'create')
34+
AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection')))
3535
GROUP BY event, hour_of_day
3636
ORDER BY hour_of_day, event;
3737

3838
------------------------------------------------------------------------------------------------------------------------
3939
-- Q4 - top 3 post veterans
4040
------------------------------------------------------------------------------------------------------------------------
4141
SELECT
42-
get_json_string(data, '$.did') as user_id,
43-
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
42+
get_json_string(data, 'did') as user_id,
43+
to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date
4444
FROM bluesky
45-
WHERE (get_json_string(data, 'kind') = 'commit')
46-
AND (get_json_string(data, 'commit.operation') = 'create')
45+
WHERE (get_json_string(data, 'kind') = 'commit')
46+
AND (get_json_string(data, 'commit.operation') = 'create')
4747
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
4848
GROUP BY user_id
4949
ORDER BY first_post_date ASC
@@ -53,13 +53,13 @@ LIMIT 3;
5353
-- Q5 - top 3 users with longest activity
5454
------------------------------------------------------------------------------------------------------------------------
5555
SELECT
56-
get_json_string(data, '$.did') as user_id,
57-
date_diff('millisecond',
58-
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
59-
max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
56+
get_json_string(data, 'did') as user_id,
57+
date_diff('millisecond',
58+
to_datetime(min(get_json_int(data, 'time_us')), 6),
59+
to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span
6060
FROM bluesky
61-
WHERE (get_json_string(data, 'kind') = 'commit')
62-
AND (get_json_string(data, 'commit.operation') = 'create')
61+
WHERE (get_json_string(data, 'kind') = 'commit')
62+
AND (get_json_string(data, 'commit.operation') = 'create')
6363
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')
6464
GROUP BY user_id
6565
ORDER BY activity_span DESC

0 commit comments

Comments (0)