1 change: 1 addition & 0 deletions quickwit/.gitignore
@@ -0,0 +1 @@
qwdata/
38 changes: 38 additions & 0 deletions quickwit/config/index-config.yaml
@@ -0,0 +1,38 @@
version: 0.8

index_id: jsonbench

doc_mapping:
  mode: dynamic
  dynamic_mapping:
    indexed: true
    stored: true
    tokenizer: raw
    expand_dots: true
  field_mappings:
    - name: did
      type: text
      tokenizer: raw
      fast: true
    - name: time_us
      type: datetime
      input_formats: [unix_timestamp]
      output_format: unix_timestamp_micros
      fast: true
      fast_precision: microseconds
    - name: kind
      type: text
      tokenizer: raw
      fast: true
    - name: commit
      type: object
      field_mappings:
        - name: operation
          type: text
          tokenizer: raw
          fast: true
        - name: collection
          type: text
          tokenizer: raw
          fast: true
  timestamp_field: time_us
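
To make the mapping concrete, here is a sketch of ingesting one event shaped like the fields above through Quickwit's per-index ingest API; the document values are illustrative assumptions, not taken from the dataset:

# Sketch: ingest one illustrative event matching the mappings above (values made up).
curl -s -X POST "http://localhost:7280/api/v1/jsonbench/ingest" \
  -H "Content-Type: application/json" \
  -d '{"did":"did:plc:example","time_us":1700000000000000,"kind":"commit","commit":{"operation":"create","collection":"app.bsky.feed.post"}}'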
7 changes: 7 additions & 0 deletions quickwit/config/quickwit.yaml
@@ -0,0 +1,7 @@
version: 0.8

listen_address: 0.0.0.0

searcher:
  aggregation_memory_limit: 64G
  request_timeout_secs: 300
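
For reference, a node can be launched against this file roughly as follows; start.sh is not shown in this diff, so this is a sketch based on the environment variables exported in main.sh:

# Sketch: run Quickwit with the config above (QW_CONFIG/QW_DATA_DIR as set in main.sh).
export QW_CONFIG="$PWD/config/quickwit.yaml"
export QW_DATA_DIR="$PWD/qwdata"
./quickwit run &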
4 changes: 4 additions & 0 deletions quickwit/count.sh
@@ -0,0 +1,4 @@
#!/bin/bash

curl -s --fail http://localhost:7280/api/v1/indexes/jsonbench/describe \
| jq ".num_published_docs"
4 changes: 4 additions & 0 deletions quickwit/data_size.sh
@@ -0,0 +1,4 @@
#!/bin/bash

curl -s --fail http://localhost:7280/api/v1/indexes/jsonbench/describe \
| jq ".size_published_splits"
7 changes: 7 additions & 0 deletions quickwit/drop_tables.sh
@@ -0,0 +1,7 @@
#!/bin/bash

echo "Stopping Quickwit"
pidof quickwit && kill $(pidof quickwit)

echo "Dropping all data"
rm -rf ./qwdata
18 changes: 18 additions & 0 deletions quickwit/install.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# The latest official release of Quickwit is too old and lacks support for many tantivy queries.
[Review comment] @rschu1ze (Member), Nov 14, 2025:
So what stops us from using the latest and greatest Docker builds?

[Author reply] Quickwit hasn't released a new version in a long time, and many people actually use a nightly build. I used a prebuilt binary here to avoid running Docker, but we can request a new binary release from the Quickwit team before merging this PR.

EDIT: Using Docker is fine; see the starrocks and singlestore submissions in this repository.

# They publish edge builds as Docker images to Docker Hub. We can extract the binary from those images.
#
# This will be replaced by an official release once a new one is published.
#
# RELEASE_VERSION=v0.9.0
# wget -N "https://github.com/quickwit-oss/quickwit/releases/download/${RELEASE_VERSION}/quickwit-${RELEASE_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
# tar xzf quickwit-${RELEASE_VERSION}-x86_64-unknown-linux-gnu.tar.gz
# mv quickwit-${RELEASE_VERSION}/quickwit ./
# rm -rf quickwit-${RELEASE_VERSION}
#
# Using a prebuilt binary here for testing
PREBUILT_NAME=quickwit-f6cb417-x86_64-unknown-linux-gnu
wget -N "https://github-actions-assets.cometkim.dev/prebuilt/$PREBUILT_NAME"
mv "$PREBUILT_NAME" ./quickwit
chmod +x ./quickwit
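
As a sketch of the Docker route discussed in the comments above, the binary could be pulled out of an edge image without running a container; the image tag and the in-image binary path are assumptions:

# Sketch: extract the quickwit binary from an edge image (tag and path assumed).
docker create --name qw-tmp quickwit/quickwit:edge
docker cp qw-tmp:/usr/local/bin/quickwit ./quickwit
docker rm qw-tmp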
43 changes: 43 additions & 0 deletions quickwit/load_data.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 4 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
MAX_FILES="$2"
SUCCESS_LOG="$3"
ERROR_LOG="$4"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Absolute path of Quickwit executable
QW_CMD="$PWD/quickwit"

echo "Prepare clean index: jsonbench"
./quickwit index create --index-config ./config/index-config.yaml --overwrite --yes

pushd "$DATA_DIRECTORY"
counter=0
for file in $(ls *.json.gz | head -n "$MAX_FILES"); do
    gunzip -c "$file"

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done | "$QW_CMD" tool local-ingest --index jsonbench
popd

# See https://github.com/quickwit-oss/quickwit/issues/4869
echo "Wait 1 min for Quickwit search become available"
sleep 60

./quickwit tool gc --index jsonbench

echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to Quickwit."
78 changes: 78 additions & 0 deletions quickwit/main.sh
@@ -0,0 +1,78 @@
#!/bin/bash

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the scale factor ("choice") as an argument
CHOICE="${1:-$DEFAULT_CHOICE}"

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${3:-success.log}"
ERROR_LOG="${4:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
    echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
    exit 1
fi

if [ "$CHOICE" = "ask" ]; then
echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -p "Enter the number corresponding to your choice: " CHOICE
fi

export QW_CONFIG="$PWD/config/quickwit.yaml"
export QW_DATA_DIR="$PWD/qwdata"

./install.sh

benchmark() {
    local size=$1
    # Check DATA_DIRECTORY contains the required number of files to run the benchmark
    file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
    if (( file_count < size )); then
        echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
        exit 1
    fi

    ./start.sh
    ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
    ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size"
    ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size"
    ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
    #./query_results.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results"
    ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime"
    ./drop_tables.sh
}

case $CHOICE in
    2)
        benchmark 10
        ;;
    3)
        benchmark 100
        ;;
    4)
        benchmark 1000
        ;;
    5)
        benchmark 1
        benchmark 10
        benchmark 100
        benchmark 1000
        ;;
    *)
        benchmark 1
        ;;
esac
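
Invocation sketch using the positional arguments parsed above:

# Run the 100m benchmark non-interactively against a custom data directory.
./main.sh 3 ~/data/bluesky success.log error.log _m6i.8xlarge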
5 changes: 5 additions & 0 deletions quickwit/queries.json5
@@ -0,0 +1,5 @@
{"query":"*","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_count":"desc"},"size": 1000}}}}
{"query":"kind:commit AND commit.operation:create","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_count":"desc"},"size":1000},"aggs":{"users":{"cardinality":{"field":"did"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:IN [app.bsky.feed.post app.bsky.feed.repost app.bsky.feed.like]","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_key":"asc"},"size":1000},"aggs":{"hour_of_day":{"date_histogram":{"field":"time_us","fixed_interval":"1h"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post","max_hits":0,"aggs":{"users":{"terms":{"field":"did","order":{"first_post":"asc"},"size":3},"aggs":{"first_post":{"min":{"field":"time_us"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post","max_hits":0,"aggs":{"users":{"terms":{"field":"did","order":{"activity_span.max":"desc"},"size":3},"aggs":{"activity_span":{"stats":{"field":"time_us"}}}}}}
116 changes: 116 additions & 0 deletions quickwit/queries_formatted.json5
@@ -0,0 +1,116 @@
/**
 * Q1 - Top event types
 */
{
  "query": "*",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_count": "desc" },
        "size": 1000
      }
    }
  }
}

/**
 * Q2 - Top event types together with unique users per event type
 */
{
  "query": "kind:commit AND commit.operation:create",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_count": "desc" },
        "size": 1000
      },
      "aggs": {
        "users": {
          "cardinality": {
            "field": "did"
          }
        }
      }
    }
  }
}

/**
 * Q3 - When do people use BlueSky
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:IN [app.bsky.feed.post app.bsky.feed.repost app.bsky.feed.like]",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_key": "asc" },
        "size": 1000
      },
      "aggs": {
        "hour_of_day": {
          "date_histogram": {
            "field": "time_us",
            "fixed_interval": "1h"
          }
        }
      }
    }
  }
}

/**
 * Q4 - top 3 post veterans
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post",
  "max_hits": 0,
  "aggs": {
    "users": {
      "terms": {
        "field": "did",
        "order": { "first_post": "asc" },
        "size": 3
      },
      "aggs": {
        "first_post": {
          "min": {
            "field": "time_us"
          }
        }
      }
    }
  }
}

/**
 * Q5 - top 3 users with longest activity
 *
 * Not fully supported: ordering by the true activity span (max - min)
 * would require tantivy to support the `bucket_script` aggregation, so
 * this query orders by `activity_span.max` instead.
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post",
  "max_hits": 0,
  "aggs": {
    "users": {
      "terms": {
        "field": "did",
        "order": { "activity_span.max": "desc" },
        "size": 3
      },
      "aggs": {
        "activity_span": {
          "stats": {
            "field": "time_us"
          }
        }
      }
    }
  }
}
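
Until `bucket_script` lands in tantivy, the span itself can be computed client-side from the stats buckets. A sketch, assuming the Q5 body above is saved as q5.json (hypothetical file) and the usual aggregation response shape:

# Sketch: derive activity span (max - min) per user from the stats aggregation.
curl -s -X POST "http://localhost:7280/api/v1/jsonbench/search" \
  -H "Content-Type: application/json" \
  -d @q5.json \
| jq '.aggregations.users.buckets[] | {did: .key, span_us: (.activity_span.max - .activity_span.min)}'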
20 changes: 20 additions & 0 deletions quickwit/query_results.sh
@@ -0,0 +1,20 @@
#!/bin/bash

QUERY_NUM=1

while read -r query; do

    # Print a header for this query
    echo "------------------------------------------------------------------------------------------------------------------------"
    echo "Result for query Q$QUERY_NUM:"
    echo

    curl -s --fail -X "POST" \
        "http://localhost:7280/api/v1/jsonbench/search" \
        -H "Accept: application/json" \
        -H "Content-Type: application/json" \
        -d "$query" | jq ".aggregations"

    # Increment the query number
    QUERY_NUM=$((QUERY_NUM + 1))
done < queries.json5