1 change: 1 addition & 0 deletions quickwit/.gitignore
@@ -0,0 +1 @@
qwdata/
38 changes: 38 additions & 0 deletions quickwit/config/index-config.yaml
@@ -0,0 +1,38 @@
version: 0.8

index_id: jsonbench

doc_mapping:
  mode: dynamic
  dynamic_mapping:
    indexed: true
    stored: true
    tokenizer: raw
    expand_dots: true
  field_mappings:
    - name: did
      type: text
      tokenizer: raw
      fast: true
    - name: time_us
      type: datetime
      input_formats: [unix_timestamp]
      output_format: unix_timestamp_micros
      fast: true
      fast_precision: microseconds
    - name: kind
      type: text
      tokenizer: raw
      fast: true
    - name: commit
      type: object
      field_mappings:
        - name: operation
          type: text
          tokenizer: raw
          fast: true
        - name: collection
          type: text
          tokenizer: raw
          fast: true
  timestamp_field: time_us
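
To make the mapping concrete, here is a sketch of ingesting one event shaped like the fields above through Quickwit's per-index ingest API; the document values are illustrative assumptions, not taken from the dataset:

# Sketch: ingest one illustrative event matching the mappings above (values made up).
curl -s -X POST "http://localhost:7280/api/v1/jsonbench/ingest" \
  -H "Content-Type: application/json" \
  -d '{"did":"did:plc:example","time_us":1700000000000000,"kind":"commit","commit":{"operation":"create","collection":"app.bsky.feed.post"}}'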
7 changes: 7 additions & 0 deletions quickwit/config/quickwit.yaml
@@ -0,0 +1,7 @@
version: 0.8

listen_address: 0.0.0.0

searcher:
  aggregation_memory_limit: 64G
  request_timeout_secs: 300
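
For reference, a node can be launched against this file roughly as follows; start.sh is not shown in this diff, so this is a sketch based on the environment variables exported in main.sh:

# Sketch: run Quickwit with the config above (QW_CONFIG/QW_DATA_DIR as set in main.sh).
export QW_CONFIG="$PWD/config/quickwit.yaml"
export QW_DATA_DIR="$PWD/qwdata"
./quickwit run &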
4 changes: 4 additions & 0 deletions quickwit/count.sh
@@ -0,0 +1,4 @@
#!/bin/bash

curl -s --fail http://localhost:7280/api/v1/indexes/jsonbench/describe \
| jq ".num_published_docs"
4 changes: 4 additions & 0 deletions quickwit/data_size.sh
@@ -0,0 +1,4 @@
#!/bin/bash

curl -s --fail http://localhost:7280/api/v1/indexes/jsonbench/describe \
| jq ".size_published_splits"
7 changes: 7 additions & 0 deletions quickwit/drop_tables.sh
@@ -0,0 +1,7 @@
#!/bin/bash

echo "Stopping Quickwit"
pidof quickwit && kill $(pidof quickwit)

echo "Dropping all data"
rm -rf ./qwdata
18 changes: 18 additions & 0 deletions quickwit/install.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# The latest official release of Quickwit is too old and lacks support for many tantivy queries.
[Review comment] @rschu1ze (Member), Nov 14, 2025:
So what stops us from using the latest and greatest Docker builds?

[Author reply] Quickwit hasn't released a new version in a long time, and many people actually use a nightly build. I used a prebuilt binary here to avoid running Docker, but we can request a new binary release from the Quickwit team before merging this PR.

EDIT: Using Docker is fine; see the starrocks and singlestore submissions in this repository.

# They publish edge builds as Docker images to Docker Hub. We can extract the binary from those images.
#
# This will be replaced by an official release once a new one is published.
#
# RELEASE_VERSION=v0.9.0
# wget -N "https://github.com/quickwit-oss/quickwit/releases/download/${RELEASE_VERSION}/quickwit-${RELEASE_VERSION}-x86_64-unknown-linux-gnu.tar.gz"
# tar xzf quickwit-${RELEASE_VERSION}-x86_64-unknown-linux-gnu.tar.gz
# mv quickwit-${RELEASE_VERSION}/quickwit ./
# rm -rf quickwit-${RELEASE_VERSION}
#
# Using a prebuilt binary here for testing
PREBUILT_NAME=quickwit-f6cb417-x86_64-unknown-linux-gnu
wget -N "https://github-actions-assets.cometkim.dev/prebuilt/$PREBUILT_NAME"
mv "$PREBUILT_NAME" ./quickwit
chmod +x ./quickwit
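
As a sketch of the Docker route discussed in the comments above, the binary could be pulled out of an edge image without running a container; the image tag and the in-image binary path are assumptions:

# Sketch: extract the quickwit binary from an edge image (tag and path assumed).
docker create --name qw-tmp quickwit/quickwit:edge
docker cp qw-tmp:/usr/local/bin/quickwit ./quickwit
docker rm qw-tmp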
43 changes: 43 additions & 0 deletions quickwit/load_data.sh
@@ -0,0 +1,43 @@
#!/bin/bash

# Check if the required arguments are provided
if [[ $# -lt 4 ]]; then
    echo "Usage: $0 <DATA_DIRECTORY> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
    exit 1
fi

# Arguments
DATA_DIRECTORY="$1"
MAX_FILES="$2"
SUCCESS_LOG="$3"
ERROR_LOG="$4"

# Validate arguments
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
[[ ! "$MAX_FILES" =~ ^[0-9]+$ ]] && { echo "Error: MAX_FILES must be a positive integer."; exit 1; }

# Absolute path of Quickwit executable
QW_CMD="$PWD/quickwit"

echo "Prepare clean index: jsonbench"
./quickwit index create --index-config ./config/index-config.yaml --overwrite --yes

pushd "$DATA_DIRECTORY"
counter=0
for file in $(ls *.json.gz | head -n "$MAX_FILES"); do
    gunzip -c "$file"

    counter=$((counter + 1))
    if [[ $counter -ge $MAX_FILES ]]; then
        break
    fi
done | "$QW_CMD" tool local-ingest --index jsonbench
popd

# See https://github.com/quickwit-oss/quickwit/issues/4869
echo "Wait 1 min for Quickwit search become available"
sleep 60

./quickwit tool gc --index jsonbench

echo -e "\nLoaded $MAX_FILES data files from $DATA_DIRECTORY to Quickwit."
78 changes: 78 additions & 0 deletions quickwit/main.sh
@@ -0,0 +1,78 @@
#!/bin/bash

DEFAULT_CHOICE=ask
DEFAULT_DATA_DIRECTORY=~/data/bluesky

# Allow the user to optionally provide the scale factor ("choice") as an argument
CHOICE="${1:-$DEFAULT_CHOICE}"

# Allow the user to optionally provide the data directory as an argument
DATA_DIRECTORY="${2:-$DEFAULT_DATA_DIRECTORY}"

# Define success and error log files
SUCCESS_LOG="${3:-success.log}"
ERROR_LOG="${4:-error.log}"

# Define prefix for output files
OUTPUT_PREFIX="${5:-_m6i.8xlarge}"

# Check if the directory exists
if [[ ! -d "$DATA_DIRECTORY" ]]; then
    echo "Error: Data directory '$DATA_DIRECTORY' does not exist."
    exit 1
fi

if [ "$CHOICE" = "ask" ]; then
echo "Select the dataset size to benchmark:"
echo "1) 1m (default)"
echo "2) 10m"
echo "3) 100m"
echo "4) 1000m"
echo "5) all"
read -p "Enter the number corresponding to your choice: " CHOICE
fi

export QW_CONFIG="$PWD/config/quickwit.yaml"
export QW_DATA_DIR="$PWD/qwdata"

./install.sh

benchmark() {
    local size=$1
    # Check DATA_DIRECTORY contains the required number of files to run the benchmark
    file_count=$(find "$DATA_DIRECTORY" -type f | wc -l)
    if (( file_count < size )); then
        echo "Error: Not enough files in '$DATA_DIRECTORY'. Required: $size, Found: $file_count."
        exit 1
    fi

    ./start.sh
    ./load_data.sh "$DATA_DIRECTORY" "$size" "$SUCCESS_LOG" "$ERROR_LOG"
    ./total_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.total_size"
    ./data_size.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.data_size"
    ./count.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.count"
    #./query_results.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.query_results"
    ./run_queries.sh | tee "${OUTPUT_PREFIX}_bluesky_${size}m.results_runtime"
    ./drop_tables.sh
}

case $CHOICE in
    2)
        benchmark 10
        ;;
    3)
        benchmark 100
        ;;
    4)
        benchmark 1000
        ;;
    5)
        benchmark 1
        benchmark 10
        benchmark 100
        benchmark 1000
        ;;
    *)
        benchmark 1
        ;;
esac
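
Invocation sketch using the positional arguments parsed above:

# Run the 100m benchmark non-interactively against a custom data directory.
./main.sh 3 ~/data/bluesky success.log error.log _m6i.8xlarge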
5 changes: 5 additions & 0 deletions quickwit/queries.json5
@@ -0,0 +1,5 @@
{"query":"*","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_count":"desc"},"size": 1000}}}}
{"query":"kind:commit AND commit.operation:create","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_count":"desc"},"size":1000},"aggs":{"users":{"cardinality":{"field":"did"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:IN [app.bsky.feed.post app.bsky.feed.repost app.bsky.feed.like]","max_hits":0,"aggs":{"events":{"terms":{"field":"commit.collection","order":{"_key":"asc"},"size":1000},"aggs":{"hour_of_day":{"date_histogram":{"field":"time_us","fixed_interval":"1h"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post","max_hits":0,"aggs":{"users":{"terms":{"field":"did","order":{"first_post":"asc"},"size":3},"aggs":{"first_post":{"min":{"field":"time_us"}}}}}}
{"query":"kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post","max_hits":0,"aggs":{"users":{"terms":{"field":"did","order":{"activity_span.max":"desc"},"size":3},"aggs":{"activity_span":{"stats":{"field":"time_us"}}}}}}
116 changes: 116 additions & 0 deletions quickwit/queries_formatted.json5
@@ -0,0 +1,116 @@
/**
 * Q1 - Top event types
 */
{
  "query": "*",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_count": "desc" },
        "size": 1000
      }
    }
  }
}

/**
 * Q2 - Top event types together with unique users per event type
 */
{
  "query": "kind:commit AND commit.operation:create",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_count": "desc" },
        "size": 1000
      },
      "aggs": {
        "users": {
          "cardinality": {
            "field": "did"
          }
        }
      }
    }
  }
}

/**
 * Q3 - When do people use BlueSky
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:IN [app.bsky.feed.post app.bsky.feed.repost app.bsky.feed.like]",
  "max_hits": 0,
  "aggs": {
    "events": {
      "terms": {
        "field": "commit.collection",
        "order": { "_key": "asc" },
        "size": 1000
      },
      "aggs": {
        "hour_of_day": {
          "date_histogram": {
            "field": "time_us",
            "fixed_interval": "1h"
          }
        }
      }
    }
  }
}

/**
 * Q4 - top 3 post veterans
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post",
  "max_hits": 0,
  "aggs": {
    "users": {
      "terms": {
        "field": "did",
        "order": { "first_post": "asc" },
        "size": 3
      },
      "aggs": {
        "first_post": {
          "min": {
            "field": "time_us"
          }
        }
      }
    }
  }
}

/**
 * Q5 - top 3 users with longest activity
 *
 * Not fully supported: ordering by the true activity span (max - min)
 * would require tantivy to support the `bucket_script` aggregation, so
 * this query orders by `activity_span.max` instead.
 */
{
  "query": "kind:commit AND commit.operation:create AND commit.collection:app.bsky.feed.post",
  "max_hits": 0,
  "aggs": {
    "users": {
      "terms": {
        "field": "did",
        "order": { "activity_span.max": "desc" },
        "size": 3
      },
      "aggs": {
        "activity_span": {
          "stats": {
            "field": "time_us"
          }
        }
      }
    }
  }
}
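
Until `bucket_script` lands in tantivy, the span itself can be computed client-side from the stats buckets. A sketch, assuming the Q5 body above is saved as q5.json (hypothetical file) and the usual aggregation response shape:

# Sketch: derive activity span (max - min) per user from the stats aggregation.
curl -s -X POST "http://localhost:7280/api/v1/jsonbench/search" \
  -H "Content-Type: application/json" \
  -d @q5.json \
| jq '.aggregations.users.buckets[] | {did: .key, span_us: (.activity_span.max - .activity_span.min)}'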
20 changes: 20 additions & 0 deletions quickwit/query_results.sh
@@ -0,0 +1,20 @@
#!/bin/bash

QUERY_NUM=1

while read -r query; do

    # Print a header for this query
    echo "------------------------------------------------------------------------------------------------------------------------"
    echo "Result for query Q$QUERY_NUM:"
    echo

    curl -s --fail -X "POST" \
        "http://localhost:7280/api/v1/jsonbench/search" \
        -H "Accept: application/json" \
        -H "Content-Type: application/json" \
        -d "$query" | jq ".aggregations"

    # Increment the query number
    QUERY_NUM=$((QUERY_NUM + 1))
done < queries.json5