diff --git a/.github/workflows/run-crasher.yml b/.github/workflows/run-crasher.yml
index 2c00d8b..c45fe3c 100644
--- a/.github/workflows/run-crasher.yml
+++ b/.github/workflows/run-crasher.yml
@@ -2,7 +2,7 @@ name: Crash
 
 on:
   push:
-    branches: [ "master" ]
+    branches: [ "*" ]
   schedule:
     # every day at 12 CET
     - cron: "0 14 * * *"
@@ -14,10 +14,6 @@ on:
       crashing_time:
         description: "Duration in seconds, default is 3600s (1h)"
         default: 3600
-      backup_working_dir:
-        description: "Backup working dir before each Qdrant restart"
-        type: boolean
-        default: false
 
 env:
   CARGO_TERM_COLOR: always
@@ -89,9 +85,9 @@ jobs:
           cp -r qdrant-src/config qdrant
 
           crashing_time="${{ steps.default_inputs.outputs.crashing_time }}"
-          backup_working_dir="${{ inputs.backup_working_dir && 'qdrant-backup' }}"
+          backup_storage_dir="${{ format('{0}/qdrant/storage-backup', github.workspace) }}"
 
-          ./crash-things.sh qdrant ../qdrant-src/target/debug/qdrant 0.3 "$crashing_time" ${backup_working_dir:+"$backup_working_dir"}
+          ./crash-things.sh qdrant ../qdrant-src/target/debug/qdrant 0.3 "$crashing_time" "$backup_storage_dir"
       - name: Upload logs on failure
         uses: actions/upload-artifact@v4
         if: failure() || cancelled()
@@ -109,7 +105,6 @@ jobs:
           retention-days: 10
           path: |
             qdrant/
-            qdrant-backup/
       - name: Send Notification
         if: failure() || cancelled()
         uses: slackapi/slack-github-action@v1.26.0
diff --git a/crash-things.sh b/crash-things.sh
index 8886fa1..ee8aabd 100755
--- a/crash-things.sh
+++ b/crash-things.sh
@@ -6,7 +6,7 @@ QDRANT_DIR=${1:-./qdrant/}
 QDRANT_EXEC=${2:-target/debug/qdrant}
 CRASH_PROBABILITY=${3:-0.3}
 RUN_TIME=${4:-300}
-QDRANT_BACKUP_DIRS=( "${@:5}" )
+QDRANT_BACKUP_DIR=${5:?backup directory is required}
 
 CRASHER_LOG=crasher.log
 QDRANT_LOG=../qdrant.log
@@ -15,7 +15,7 @@ CRASHER_CMD=(
     cargo run --release
     --
     --working-dir "$QDRANT_DIR"
-    ${QDRANT_BACKUP_DIRS[@]/#/--backup-working-dir } # this does not handle spaces 😬
+    --storage-backup "$QDRANT_BACKUP_DIR" # quoted so a path with spaces stays one argument
     --exec-path "$QDRANT_EXEC"
     --crash-probability "$CRASH_PROBABILITY"
 )
diff --git a/readme.md b/readme.md
index 44d7c46..c7eb83f 100644
--- a/readme.md
+++ b/readme.md
@@ -17,8 +17,8 @@ Usage: crasher [OPTIONS] --working-dir <WORKING_DIR> --exec-path <EXEC_PATH>
 Options:
       --working-dir <WORKING_DIR>
           Working directory for Qdrant data
-      --backup-working-dir <BACKUP_WORKING_DIR>
-          Backup working directory between Qdrant restarts (useful to debug storage recovery issues)
+      --storage-backup <STORAGE_BACKUP>
+          Backup `storage` directory from `working_dir` between Qdrant restarts (useful to debug storage recovery issues)
       --exec-path <EXEC_PATH>
           Path to executable binary relative to `working_dir`
       --crash-probability <CRASH_PROBABILITY>
diff --git a/src/args.rs b/src/args.rs
index 957ac3a..301286d 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -7,9 +7,9 @@ pub struct Args {
     /// Working directory for Qdrant data
     #[arg(long)]
     pub working_dir: String,
-    /// Backup working directory between Qdrant restarts (useful to debug storage recovery issues)
+    /// Backup `storage` directory from `working_dir` between Qdrant restarts (useful to debug storage recovery issues)
     #[arg(long)]
-    pub backup_working_dir: Vec<String>,
+    pub storage_backup: Option<String>,
     /// Path to executable binary relative to `working_dir`
     #[arg(long)]
     pub exec_path: String,
diff --git a/src/main.rs b/src/main.rs
index a6a4ae8..3102030 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -77,7 +77,7 @@ async fn main() {
     let (rng_seed, mut workload_rng, mut chaos_rng) = create_rngs(args.rng_seed);
 
     // workload task
-    let workload = Workload::new(
+    let workload: Workload = Workload::new(
         collection_name,
         stopped.clone(),
         crash_lock.clone(),
diff --git a/src/process.rs b/src/process.rs
index e6c20d9..23c9629 100644
--- a/src/process.rs
+++ b/src/process.rs
@@ -4,8 +4,8 @@ use crate::util;
 use anyhow::Context as _;
 use qdrant_client::Qdrant;
 use rand::Rng;
-use std::collections::VecDeque;
 use std::io;
+use std::path::PathBuf;
 use std::process::exit;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -50,6 +50,6 @@ pub fn start_process(
 pub struct ProcessManager {
     pub working_dir: String,
     pub binary_path: String,
-    pub backup_dirs: VecDeque<String>,
+    pub backup_dir: Option<String>,
     pub child_process: Child,
     pub kill_on_drop: bool,
@@ -58,13 +58,16 @@ pub struct ProcessManager {
 
 impl ProcessManager {
     pub fn from_args(args: &Args) -> io::Result<Self> {
-        let manager = Self::new(
+        let mut manager = Self::new(
             &args.working_dir,
             &args.exec_path,
             args.shutdown_on_error,
             args.cpu_quota,
-        )?
-        .with_backup_dirs(args.backup_working_dir.clone());
+        )?;
+
+        if let Some(storage_backup) = &args.storage_backup {
+            manager = manager.with_backup_dirs(storage_backup);
+        }
 
         Ok(manager)
     }
@@ -80,15 +83,16 @@ impl ProcessManager {
         Ok(Self {
             working_dir: working_dir.to_string(),
             binary_path: binary_path.to_string(),
-            backup_dirs: VecDeque::new(),
+            backup_dir: None,
             child_process: child,
             kill_on_drop,
             cpu_quota,
         })
     }
 
-    pub fn with_backup_dirs(mut self, backup_dirs: impl Into<VecDeque<String>>) -> Self {
-        self.backup_dirs = backup_dirs.into();
+    /// Sets the directory that `backup_storage_dir` writes the `storage` copy to.
+    pub fn with_backup_dirs(mut self, backup_dir: impl Into<String>) -> Self {
+        self.backup_dir = Some(backup_dir.into());
         self
     }
 
@@ -97,25 +101,29 @@ impl ProcessManager {
         self.child_process.kill().await.unwrap();
     }
 
-    pub async fn backup_working_dir(&mut self) -> anyhow::Result<()> {
-        let Some(backup_dir) = self.backup_dirs.front() else {
+    /// Removes the configured backup directory if it exists, then passes
+    /// `<working_dir>/storage` to `util::copy_dir`; a no-op when no backup directory is set.
+    pub async fn backup_storage_dir(&mut self) -> anyhow::Result<()> {
+        let Some(backup_dir) = &self.backup_dir else {
             return Ok(());
         };
 
         let backup_exists = fs::try_exists(backup_dir).await.with_context(|| {
-            format!("failed to query if backup working dir {backup_dir} exists")
+            format!("failed to query if backup storage dir {backup_dir} exists")
         })?;
 
+        let backup_dir = PathBuf::from(backup_dir);
+        let backup_dir_path = backup_dir.as_path();
+
+        let source_storage_dir = PathBuf::from(&self.working_dir).join("storage");
+
         if backup_exists {
-            fs::remove_dir_all(backup_dir)
+            fs::remove_dir_all(backup_dir_path)
                 .await
-                .with_context(|| format!("failed to remove backup working dir {backup_dir}"))?;
+                .with_context(|| format!("failed to remove backup storage dir {backup_dir_path:?}"))?;
         }
 
-        util::copy_dir(&self.working_dir, backup_dir).await?;
-
-        let backup_dir = self.backup_dirs.pop_front().expect("backup dir");
-        self.backup_dirs.push_back(backup_dir);
+        util::copy_dir(&source_storage_dir, backup_dir_path).await?;
 
         Ok(())
     }
@@ -141,13 +149,11 @@ impl ProcessManager {
         log::info!("** Restarting qdrant **");
         self.kill_process().await;
 
-        if let Err(err) = self.backup_working_dir().await {
-            log::error!(
-                "Failed to backup working dir {} to {}: {err:?}",
-                self.working_dir,
-                self.backup_dirs.front().expect("backup dir"),
-            );
-        }
+        // A failed backup only loses a debugging aid, so log it and keep the run
+        // going instead of panicking.
+        if let Err(err) = self.backup_storage_dir().await {
+            log::error!("Failed to backup storage dir of {}: {err:?}", self.working_dir);
+        }
 
         self.child_process = start_process(
             &self.working_dir,