Skip to content

Commit 2f27285

Browse files
authored
Merge pull request #219 from qdrant/feat/keep-datasets
feat/keep-datasets
2 parents 5ec825e + 2b45d53 commit 2f27285

File tree

4 files changed

+85
-0
lines changed

4 files changed

+85
-0
lines changed

.github/workflows/clean-datasets.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
# Workflow: remove the cached `ci-datasets` volume from the benchmark client machine.
name: Clean Datasets

# Triggers: repository_dispatch event, manual run, and a monthly schedule.
on:
  repository_dispatch:
  workflow_dispatch:
  schedule:
    # Run every month on the 1st day at 3 am
    - cron: "0 3 1 * *"

# NOTE(review): presumably the same group as the Continuous Benchmark workflow, so the
# volume is not removed while a benchmark is using it - confirm against that workflow.
concurrency:
  group: continuous-benchmark

# This removes the ci-datasets volume from the client machine.
# The next run of Continuous Benchmark will create the volume again and download all the datasets.
jobs:
  removeDatasetsVolume:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      # NOTE(review): the action ref was mangled in the source this was recovered from;
      # the version tag below is an assumption - confirm before merging.
      - uses: webfactory/ssh-agent@v0.8.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Benches
        id: benches
        env:
          # Passed through the environment instead of being interpolated into the script text.
          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
        # No `set +e` here: the default shell exits on the first failing command, so a
        # failed or timed-out removal fails this step and triggers the notification below.
        run: |
          timeout 10m bash -x tools/run_client_remove_volume.sh
      - name: Send Notification
        # Runs only when an earlier step in this job has failed.
        if: failure()
        # NOTE(review): the action ref was mangled in the source this was recovered from;
        # the version tag below is an assumption - confirm before merging.
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "Failed to remove the datasets volume (removeDatasetsVolume), run status: ${{ job.status }}",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "View the results <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|here>"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.CI_ALERTS_CHANNEL_WEBHOOK_URL }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK

tools/run_client_remove_volume.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
#!/bin/bash
#
# Remove the `ci-datasets` docker volume from the benchmark client machine.
#
# Environment (all optional):
#   CLOUD_NAME       directory under this script's location holding get_public_ip.sh
#                    (default: hetzner)
#   SERVER_USERNAME  user for the ssh connection (default: root)
#   CLIENT_NAME      machine name passed to get_public_ip.sh (default: benchmark-client-1)

# Trace prefix used when the script runs under `bash -x`.
PS4='ts=$(date "+%Y-%m-%dT%H:%M:%SZ") level=DEBUG line=$LINENO file=$BASH_SOURCE '
set -euo pipefail

# Fill in defaults for anything the caller left unset or empty.
: "${CLOUD_NAME:=hetzner}"
: "${SERVER_USERNAME:=root}"
client_name=${CLIENT_NAME:-"benchmark-client-1"}

# Absolute directory of this script, used to locate the per-cloud helper.
self=$(realpath "$0")
tools_dir=$(dirname "$self")

# Resolve the client's public IP through the cloud-specific helper script.
client_ip=$(bash "${tools_dir}/${CLOUD_NAME}/get_public_ip.sh" "$client_name")

echo "Remove ci-datasets volume from client"

# `|| true` makes the remote command exit 0 even when the removal itself fails.
remote_cmd="docker volume rm -f ci-datasets || true"
ssh_opts=(-tt -o ServerAliveInterval=120 -o ServerAliveCountMax=10)

ssh "${ssh_opts[@]}" "${SERVER_USERNAME}@${client_ip}" "${remote_cmd}"

tools/run_client_script.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ if [[ "$EXPERIMENT_MODE" == "snapshot" ]]; then
3939

4040
else
4141
scp "${SCRIPT_PATH}/run_experiment.sh" "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}:~/run_experiment.sh"
42+
scp "${SCRIPT_PATH}/../datasets/datasets.json" "${SERVER_USERNAME}@${IP_OF_THE_CLIENT}:~/datasets.json"
4243

4344
RUN_EXPERIMENT="ENGINE_NAME=${ENGINE_NAME} \
4445
DATASETS=${DATASETS} \

tools/run_experiment.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,23 @@ if [[ "$EXPERIMENT_MODE" != "snapshot" ]]; then
4545
docker rmi --force qdrant/vector-db-benchmark:latest || true
4646
fi
4747

48+
echo "Ensure datasets volume exists and contains latest datasets.json"
49+
docker volume create ci-datasets
50+
if [[ -f "$HOME/datasets.json" ]]; then
51+
echo "Found datasets.json, update the volume"
52+
mv ~/datasets.json "$(docker volume inspect ci-datasets -f '{{ .Mountpoint }}')"
53+
else
54+
echo "datasets.json is missing, do not update the volume"
55+
fi
56+
4857
if [[ "$EXPERIMENT_MODE" == "full" ]] || [[ "$EXPERIMENT_MODE" == "upload" ]]; then
4958
echo "EXPERIMENT_MODE=$EXPERIMENT_MODE"
5059
docker run \
5160
--rm \
5261
-it \
5362
--name ci-benchmark-upload \
5463
-v "$HOME/results:/code/results" \
64+
-v "ci-datasets:/code/datasets" \
5565
qdrant/vector-db-benchmark:latest \
5666
python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-search
5767
fi
@@ -70,6 +80,7 @@ if [[ "$EXPERIMENT_MODE" == "full" ]] || [[ "$EXPERIMENT_MODE" == "search" ]]; t
7080
-it \
7181
--name ci-benchmark-search \
7282
-v "$HOME/results:/code/results" \
83+
-v "ci-datasets:/code/datasets" \
7384
qdrant/vector-db-benchmark:latest \
7485
python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-upload
7586
fi
@@ -85,6 +96,7 @@ if [[ "$EXPERIMENT_MODE" == "parallel" ]]; then
8596
--rm \
8697
--name ci-benchmark-upload \
8798
-v "$HOME/results/parallel:/code/results" \
99+
-v "ci-datasets:/code/datasets" \
88100
qdrant/vector-db-benchmark:latest \
89101
python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-search --skip-configure &
90102
UPLOAD_PID=$!
@@ -94,6 +106,7 @@ if [[ "$EXPERIMENT_MODE" == "parallel" ]]; then
94106
--rm \
95107
--name ci-benchmark-search \
96108
-v "$HOME/results/parallel:/code/results" \
109+
-v "ci-datasets:/code/datasets" \
97110
qdrant/vector-db-benchmark:latest \
98111
python run.py --engines "${ENGINE_NAME}" --datasets "${DATASETS}" --host "${PRIVATE_IP_OF_THE_SERVER}" --no-skip-if-exists --skip-upload &
99112
SEARCH_PID=$!

0 commit comments

Comments
 (0)