"""
Test Qdrant's accuracy in scenarios of continuous updates of real data.

This script will:

- Create a Qdrant collection and make an initial upload of all available vectors from `data/dataset1`
- Measure the accuracy of the search
- Start replacing the collection's vectors by deleting points and re-uploading them with new vectors from `data/dataset2`
- Once finished, measure the accuracy of the search again
"""

import json
import os
import random
import sys
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import tqdm
from qdrant_client import QdrantClient, models

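# Runtime configuration: the Qdrant URL is hard-coded to http://localhost:6333 in
# main(); dataset names, vector dimension, and the output file can be overridden via
# environment variables. Each dataset directory is expected to contain a
# `vectors.npy` file with the embeddings and a `tests.jsonl` file with queries and
# their ground-truth neighbours.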
QDRANT_COLLECTION_NAME = "benchmark"
OUTPUT_FILENAME = os.getenv("OUTPUT_FILENAME", "output.json")
DATASET_DIM = int(os.getenv("DATASET_DIM", 512))
DATASET_NAME = os.getenv("DATASET_NAME", "laion-small-clip-no-filters-1")
DATASET_NAME_2 = os.getenv("DATASET_NAME_2", "laion-small-clip-no-filters-2")
DATA_DIR = Path(__file__).parent / "data" / DATASET_NAME
DATA_DIR_2 = Path(__file__).parent / "data" / DATASET_NAME_2

VECTORS_FILE_1 = DATA_DIR / "vectors.npy"
VECTORS_FILE_2 = DATA_DIR_2 / "vectors.npy"

TEST_DATA_FILE_1 = DATA_DIR / "tests.jsonl"
TEST_DATA_FILE_2 = DATA_DIR_2 / "tests.jsonl"

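# All TOTAL_VECTORS points are migrated to the second dataset in batches of
# BATCH_SIZE points per delete/upsert/wait cycle.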
TOTAL_VECTORS = 100_000
BATCH_SIZE = 500


def read_test_data(file: Path, limit: int = 1000):
    """
    Yield up to `limit` test cases from a JSONL file, one JSON object per line:

    {
        "query": [
            0.022043373435735703,
            -0.022230295464396477,
            ....
        ],
        "closest_ids": [
            43749,
            43756,
            ....
        ]
    }
    """
    with open(file, "r") as f:
        for idx, line in enumerate(f):
            if idx >= limit:
                break

            yield json.loads(line)

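# For reference, a `tests.jsonl` line can be produced by brute-force search over the
# dataset. This is only a hypothetical sketch (not used by the benchmark), assuming
# cosine similarity and a fixed top-k:
#
#   def make_test_line(vectors: np.ndarray, query: np.ndarray, k: int = 10) -> str:
#       scores = vectors @ query / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query))
#       closest = np.argsort(-scores)[:k]
#       return json.dumps({"query": query.tolist(), "closest_ids": closest.tolist()})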

class QdrantBenchmark:

    def __init__(self, url):
        client = QdrantClient(url=url, prefer_grpc=True)
        self.client = client

        # Drop any collection left over from a previous run before re-creating it.
        self.client.delete_collection(QDRANT_COLLECTION_NAME)

        self.client.create_collection(
            QDRANT_COLLECTION_NAME,
            vectors_config=models.VectorParams(
                size=DATASET_DIM,
                distance=models.Distance.COSINE,
            ),
            # Aggressive optimizer settings: vacuum segments as soon as a tiny
            # fraction of points is deleted, so deletions are cleaned up
            # continuously during the update loop.
            optimizers_config=models.OptimizersConfigDiff(
                deleted_threshold=0.001,
                vacuum_min_vector_number=100,
            ),
        )

    def initial_upload(self, vectors: np.ndarray):
        # Bulk-load the first dataset; point ids are simply the row indices.
        self.client.upload_collection(
            collection_name=QDRANT_COLLECTION_NAME,
            vectors=vectors,
            ids=range(len(vectors)),
        )

    def upload_points(self, vectors: np.ndarray, ids: list[int]):
        # Point ids double as row indices into `vectors`, so each point keeps its
        # id but receives the replacement vector from the second dataset.
        points = [
            models.PointStruct(id=idx, vector=vectors[idx].tolist()) for idx in ids
        ]

        self.client.upsert(
            collection_name=QDRANT_COLLECTION_NAME,
            points=points,
        )

    def validate_test_data(self, file: Path) -> float:
        """Return the fraction of ground-truth neighbours found by the search."""
        total_results = 0
        matched_results = 0
        for test in tqdm.tqdm(read_test_data(file), desc="Validating test data"):
            query = test["query"]
            closest_ids = set(test["closest_ids"])

            results = self.client.query_points(
                collection_name=QDRANT_COLLECTION_NAME,
                query=query,
                limit=len(closest_ids),
            )

            results_idx = set(obj.id for obj in results.points)

            matched_results += len(closest_ids & results_idx)
            total_results += len(closest_ids)

        return matched_results / total_results

    def delete_points(self, points_to_delete: set):
        self.client.delete(
            collection_name=QDRANT_COLLECTION_NAME,
            points_selector=models.PointIdsList(
                points=list(points_to_delete)
            ),
        )

    def wait_ready(self) -> float:
        """Poll the collection until its status is GREEN twice in a row.

        Returns the time in seconds from the first poll until the first GREEN
        status was observed, i.e. roughly the time spent optimizing/indexing.
        """
        wait_interval = 0.2
        confirmations_required = 2

        start_time = time.time()
        confirmations = 0
        first_green_time: float | None = None

        while True:
            collection_info = self.client.get_collection(QDRANT_COLLECTION_NAME)
            if collection_info.status == models.CollectionStatus.GREEN:
                confirmations += 1
                first_green_time = first_green_time or time.time()
                if confirmations == confirmations_required:
                    return first_green_time - start_time
            else:
                confirmations = 0
                first_green_time = None
            time.sleep(wait_interval)

    def __del__(self):
        self.client.close()

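# store_to_file() writes a single JSON object to OUTPUT_FILENAME with the keys
# collected in main(): "initial_precision", "precision_before_iteration",
# "indexing_total_time_s", "precision_after_iteration", plus a "timestamp".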
def store_to_file(data_dict):
    timestamped_dict = data_dict.copy()
    timestamped_dict["timestamp"] = datetime.now().isoformat()

    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as f:
        json.dump(timestamped_dict, f, ensure_ascii=False)


def main():
    result = {}
    vectors_1 = np.load(VECTORS_FILE_1)
    vectors_2 = np.load(VECTORS_FILE_2)

    benchmark = QdrantBenchmark("http://localhost:6333")
    benchmark.initial_upload(vectors_1)
    benchmark.wait_ready()

    initial_precision = benchmark.validate_test_data(TEST_DATA_FILE_1)
    print(f"Precision dataset1: {initial_precision}")
    result["initial_precision"] = initial_precision
    result["precision_before_iteration"] = initial_precision

    # Replace every point exactly once, in random order.
    points_to_migrate = list(range(TOTAL_VECTORS))
    random.shuffle(points_to_migrate)

    total_indexing_time = 0.0
    for i in tqdm.tqdm(range(0, len(points_to_migrate), BATCH_SIZE), desc="Iterating"):
        batch = points_to_migrate[i : i + BATCH_SIZE]

        # Delete the old points, re-insert the same ids with vectors from the
        # second dataset, then wait for the collection to settle.
        benchmark.delete_points(set(batch))
        benchmark.upload_points(vectors_2, batch)
        total_indexing_time += benchmark.wait_ready()

    print(f"Indexing: {total_indexing_time}")
    result["indexing_total_time_s"] = total_indexing_time

    precision_after_iteration = benchmark.validate_test_data(TEST_DATA_FILE_2)
    print(f"Precision dataset2: {precision_after_iteration}")
    result["precision_after_iteration"] = precision_after_iteration

    store_to_file(result)


if __name__ == "__main__":
    # Line-buffer stdout/stderr so progress output is not delayed by buffering.
    sys.stdout.reconfigure(line_buffering=True)
    sys.stderr.reconfigure(line_buffering=True)

    main()

    sys.stdout.flush()
    sys.stderr.flush()