Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions .github/workflows/run-crasher.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Crash

on:
push:
branches: [ "master" ]
branches: [ "*" ]
schedule:
# every day at 14:00 UTC (15:00 CET / 16:00 CEST)
- cron: "0 14 * * *"
Expand All @@ -14,10 +14,6 @@ on:
crashing_time:
description: "Duration in seconds, default is 3600s (1h)"
default: 3600
backup_working_dir:
description: "Backup working dir before each Qdrant restart"
type: boolean
default: false

env:
CARGO_TERM_COLOR: always
Expand Down Expand Up @@ -89,9 +85,9 @@ jobs:
cp -r qdrant-src/config qdrant

crashing_time="${{ steps.default_inputs.outputs.crashing_time }}"
backup_working_dir="${{ inputs.backup_working_dir && 'qdrant-backup' }}"
backup_storage_dir="${{ format('{0}/qdrant/storage-backup', github.workspace) }}"

./crash-things.sh qdrant ../qdrant-src/target/debug/qdrant 0.3 "$crashing_time" ${backup_working_dir:+"$backup_working_dir"}
./crash-things.sh qdrant ../qdrant-src/target/debug/qdrant 0.3 "$crashing_time" "$backup_storage_dir"
- name: Upload logs on failure
uses: actions/upload-artifact@v4
if: failure() || cancelled()
Expand All @@ -109,7 +105,6 @@ jobs:
retention-days: 10
path: |
qdrant/
qdrant-backup/
- name: Send Notification
if: failure() || cancelled()
uses: slackapi/[email protected]
Expand Down
4 changes: 2 additions & 2 deletions crash-things.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ QDRANT_DIR=${1:-./qdrant/}
QDRANT_EXEC=${2:-target/debug/qdrant}
CRASH_PROBABILITY=${3:-0.3}
RUN_TIME=${4:-300}
QDRANT_BACKUP_DIRS=( "${@:5}" )
QDRANT_BACKUP_DIR=${5:?backup directory is required}

CRASHER_LOG=crasher.log
QDRANT_LOG=../qdrant.log
Expand All @@ -15,7 +15,7 @@ CRASHER_CMD=(
cargo run --release
--
--working-dir "$QDRANT_DIR"
${QDRANT_BACKUP_DIRS[@]/#/--backup-working-dir } # this does not handle spaces 😬
--storage-backup $QDRANT_BACKUP_DIR
--exec-path "$QDRANT_EXEC"
--crash-probability "$CRASH_PROBABILITY"
)
Expand Down
4 changes: 2 additions & 2 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Usage: crasher [OPTIONS] --working-dir <WORKING_DIR> --exec-path <EXEC_PATH>
Options:
--working-dir <WORKING_DIR>
Working directory for Qdrant data
--backup-working-dir <BACKUP_WORKING_DIR>
Backup working directory between Qdrant restarts (useful to debug storage recovery issues)
--storage-backup <STORAGE_BACKUP>
Backup `storage` directory from `working_dir` between Qdrant restarts (useful to debug storage recovery issues)
--exec-path <EXEC_PATH>
Path to executable binary relative to `working_dir`
--crash-probability <CRASH_PROBABILITY>
Expand Down
4 changes: 2 additions & 2 deletions src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ pub struct Args {
/// Working directory for Qdrant data
#[arg(long)]
pub working_dir: String,
/// Backup working directory between Qdrant restarts (useful to debug storage recovery issues)
/// Backup `storage` directory from `working_dir` between Qdrant restarts (useful to debug storage recovery issues)
#[arg(long)]
pub backup_working_dir: Vec<String>,
pub storage_backup: Option<String>,
/// Path to executable binary relative to `working_dir`
#[arg(long)]
pub exec_path: String,
Expand Down
2 changes: 1 addition & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ async fn main() {
let (rng_seed, mut workload_rng, mut chaos_rng) = create_rngs(args.rng_seed);

// workload task
let workload = Workload::new(
let workload: Workload = Workload::new(
collection_name,
stopped.clone(),
crash_lock.clone(),
Expand Down
47 changes: 23 additions & 24 deletions src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use crate::util;
use anyhow::Context as _;
use qdrant_client::Qdrant;
use rand::Rng;
use std::collections::VecDeque;
use std::io;
use std::path::PathBuf;
use std::process::exit;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
Expand Down Expand Up @@ -50,21 +50,24 @@ pub fn start_process(
pub struct ProcessManager {
pub working_dir: String,
pub binary_path: String,
pub backup_dirs: VecDeque<String>,
pub backup_dir: Option<String>,
pub child_process: Child,
pub kill_on_drop: bool,
pub cpu_quota: Option<u32>,
}

impl ProcessManager {
pub fn from_args(args: &Args) -> io::Result<Self> {
let manager = Self::new(
let mut manager = Self::new(
&args.working_dir,
&args.exec_path,
args.shutdown_on_error,
args.cpu_quota,
)?
.with_backup_dirs(args.backup_working_dir.clone());
)?;

if let Some(storage_backup) = &args.storage_backup {
manager = manager.with_backup_dirs(storage_backup);
}

Ok(manager)
}
Expand All @@ -80,15 +83,15 @@ impl ProcessManager {
Ok(Self {
working_dir: working_dir.to_string(),
binary_path: binary_path.to_string(),
backup_dirs: VecDeque::new(),
backup_dir: None,
child_process: child,
kill_on_drop,
cpu_quota,
})
}

pub fn with_backup_dirs(mut self, backup_dirs: impl Into<VecDeque<String>>) -> Self {
self.backup_dirs = backup_dirs.into();
pub fn with_backup_dirs(mut self, backup_dir: impl Into<String>) -> Self {
self.backup_dir = Some(backup_dir.into());
self
}

Expand All @@ -97,25 +100,27 @@ impl ProcessManager {
self.child_process.kill().await.unwrap();
}

pub async fn backup_working_dir(&mut self) -> anyhow::Result<()> {
let Some(backup_dir) = self.backup_dirs.front() else {
pub async fn backup_storage_dir(&mut self) -> anyhow::Result<()> {
let Some(backup_dir) = &self.backup_dir else {
return Ok(());
};

let backup_exists = fs::try_exists(backup_dir).await.with_context(|| {
format!("failed to query if backup working dir {backup_dir} exists")
format!("failed to query if backup storage dir {backup_dir} exists")
})?;

let backup_dir = PathBuf::from(backup_dir);
let backup_dir_path = backup_dir.as_path();

let source_storage_dir = PathBuf::from(&self.working_dir).join("storage");

if backup_exists {
fs::remove_dir_all(backup_dir)
fs::remove_dir_all(backup_dir_path)
.await
.with_context(|| format!("failed to remove backup working dir {backup_dir}"))?;
.with_context(|| format!("failed to remove backup storage dir {backup_dir_path:?}"))?;
}

util::copy_dir(&self.working_dir, backup_dir).await?;

let backup_dir = self.backup_dirs.pop_front().expect("backup dir");
self.backup_dirs.push_back(backup_dir);
util::copy_dir(&source_storage_dir, backup_dir_path).await?;

Ok(())
}
Expand All @@ -141,13 +146,7 @@ impl ProcessManager {
log::info!("** Restarting qdrant **");
self.kill_process().await;

if let Err(err) = self.backup_working_dir().await {
log::error!(
"Failed to backup working dir {} to {}: {err:?}",
self.working_dir,
self.backup_dirs.front().expect("backup dir"),
);
}
self.backup_storage_dir().await.unwrap();

self.child_process = start_process(
&self.working_dir,
Expand Down
Loading