From 0b8f12e38130125b6ffbed2f6d0500937534266e Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 17:51:42 -0700 Subject: [PATCH 01/19] chore: boop dependencies --- Cargo.lock | 151 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 107 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7135c02..3d05d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,18 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.4" @@ -582,6 +570,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "evmap" +version = "11.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8874945f036109c72242964c1174cf99434e30cfa45bf45fedc983f50046f8" +dependencies = [ + "hashbag", + "left-right", + "smallvec", +] + [[package]] name = "fastrand" version = "2.4.1" @@ -737,6 +736,21 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -987,9 +1001,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" dependencies = [ "atomic-waker", "bytes", @@ -1004,6 +1018,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "hashbag" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7040a10f52cba493ddb09926e15d10a9d8a28043708a405931fe4c6f19fac064" + [[package]] name = "hashbrown" version = "0.15.5" @@ -1318,9 +1338,9 @@ dependencies = [ [[package]] name = "idna_adapter" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" dependencies = [ "icu_normalizer", "icu_properties", @@ -1401,10 +1421,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.95" +version = "0.3.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca" +checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -1434,6 +1456,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" +[[package]] +name = "left-right" +version = "0.11.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f0c21e4c8ff95f487fb34e6f9182875f42c84cef966d29216bf115d9bba835a" +dependencies = [ + "crossbeam-utils", + "loom", + "slab", +] + [[package]] name = "libc" version = "0.2.186" @@ -1455,7 
+1488,7 @@ dependencies = [ "bitflags", "libc", "plain", - "redox_syscall 0.7.4", + "redox_syscall 0.7.5", ] [[package]] @@ -1489,6 +1522,19 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "matchers" version = "0.2.0" @@ -1522,21 +1568,22 @@ checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "metrics" -version = "0.24.3" +version = "0.24.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8" +checksum = "ff56c2e7dce6bd462e3b8919986a617027481b1dcc703175b58cf9dd98a2f071" dependencies = [ - "ahash", "portable-atomic", + "rapidhash", ] [[package]] name = "metrics-exporter-prometheus" -version = "0.18.1" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3589659543c04c7dc5526ec858591015b87cd8746583b51b48ef4353f99dbcda" +checksum = "1db0d8f1fc9e62caebd0319e11eaec5822b0186c171568f0480b46a0137f9108" dependencies = [ "base64", + "evmap", "http-body-util", "hyper", "hyper-rustls", @@ -1554,9 +1601,9 @@ dependencies = [ [[package]] name = "metrics-util" -version = "0.20.1" +version = "0.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdfb1365fea27e6dd9dc1dbc19f570198bc86914533ad639dae939635f096be4" +checksum = "9e56997f084e57b045edf17c3ed8ba7f9f779c670df8206dfd1c736f4c02dc4a" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -1565,6 +1612,7 @@ dependencies = [ "quanta", "rand 0.9.4", "rand_xoshiro", + "rapidhash", "sketches-ddsketch", ] @@ -1938,6 +1986,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + [[package]] name = "raw-cpuid" version = "11.6.0" @@ -1958,9 +2015,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a" +checksum = "4666a1a60d8412eab19d94f6d13dcc9cea0a5ef4fdf6a5db306537413c661b1b" dependencies = [ "bitflags", ] @@ -2030,9 +2087,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.39" +version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c2c118cb077cca2822033836dfb1b975355dfb784b5e8da48f7b6c5db74e60e" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ "aws-lc-rs", "once_cell", @@ -2097,6 +2154,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" @@ -2628,9 +2691,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = 
"1.52.1" +version = "1.52.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" +checksum = "110a78583f19d5cdb2c5ccf321d1290344e71313c6c37d43520d386027d18386" dependencies = [ "bytes", "libc", @@ -2706,9 +2769,9 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.6.8" +version = "0.6.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +checksum = "68d6fdd9f81c2819c9a8b0e0cd91660e7746a8e6ea2ba7c6b2b057985f6bcb51" dependencies = [ "bitflags", "bytes", @@ -2959,9 +3022,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.118" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" +checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1" dependencies = [ "cfg-if", "once_cell", @@ -2972,9 +3035,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.118" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" +checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2982,9 +3045,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.118" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" +checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41" dependencies = [ "bumpalo", "proc-macro2", @@ -2995,9 +3058,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.118" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea" dependencies = [ "unicode-ident", ] @@ -3038,9 +3101,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.95" +version = "0.3.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" +checksum = "2eadbac71025cd7b0834f20d1fe8472e8495821b4e9801eb0a60bd1f19827602" dependencies = [ "js-sys", "wasm-bindgen", From 199d746a351e1cf82b958b2923f3c15da0ebeac5 Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:06:31 -0700 Subject: [PATCH 02/19] fix(dlq): capture permanently-failed jobs to DLQ before startup cleanup `WorkerRunner::run_until_cancelled` called `startup_cleanup_with_timeouts` synchronously at startup, whose third step `cleanup_permanently_failed_jobs` DELETEs from `_private_jobs` where `attempts >= max_attempts AND locked_at IS NULL`. The DLQ processor uses the same predicate to find candidates, but runs as a periodic background task that hadn't ticked yet. Net effect: any job that hit max_attempts during the previous run (or in the gap between its final failure and the next DLQ tick) was silently deleted on every worker restart instead of being captured to the DLQ. Fix: when DLQ is enabled, run `process_failed_jobs()` synchronously before startup cleanup. 
Also document the ordering invariant on `cleanup_permanently_failed_jobs` and `startup_cleanup_with_timeouts` for direct callers (ad-hoc maintenance scripts). Adds a regression test that stages a permanently-failed row in `_private_jobs`, runs the actual `WorkerRunner` startup path, and asserts the row reaches the DLQ. Verified to fail on the previous code. --- src/client/cleanup.rs | 30 +++++++++-- src/worker.rs | 36 +++++++++++++ tests/dlq_tests.rs | 117 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 179 insertions(+), 4 deletions(-) diff --git a/src/client/cleanup.rs b/src/client/cleanup.rs index e577a24..ab314a8 100644 --- a/src/client/cleanup.rs +++ b/src/client/cleanup.rs @@ -134,9 +134,25 @@ impl BackfillClient { /// remain in the main queue with `is_available = false`. These jobs /// will never be processed again and should be cleaned up. /// - /// Note: These jobs should already be captured to the DLQ by the task - /// handler or DLQ processor before reaching this state. This function - /// removes the leftover rows from the main queue. + /// # Important: ordering with the DLQ + /// + /// **This function deletes the same rows the DLQ processor uses as input.** + /// If you call it before `process_failed_jobs()` has captured those rows + /// into the DLQ, those jobs are lost forever — they leave the main queue + /// without ever reaching the DLQ. + /// + /// Safe usage when DLQ is enabled: + /// 1. Call `process_failed_jobs()` first (moves rows into DLQ). + /// 2. Then call this function (cleans up anything the DLQ processor + /// chose not to move — typically jobs with `max_attempts = 0`, which + /// the DLQ processor explicitly skips). + /// + /// `WorkerRunner::run_until_cancelled` already enforces this ordering at + /// startup. Direct callers (ad-hoc maintenance scripts, etc.) must enforce + /// it themselves. + /// + /// If DLQ is disabled, this function is the only cleanup mechanism and will + /// silently delete failed jobs — that is by design. /// /// # Returns /// Number of permanently failed jobs that were deleted @@ -186,6 +202,14 @@ impl BackfillClient { /// This allows configuring the stale lock thresholds for environments /// where the defaults aren't appropriate. /// + /// # DLQ ordering note + /// + /// This calls `cleanup_permanently_failed_jobs()`, which DELETEs rows from + /// `_private_jobs` where `attempts >= max_attempts`. If you run a DLQ, + /// **call `process_failed_jobs()` first** so those rows reach the DLQ + /// before they're deleted. `WorkerRunner::run_until_cancelled` does this + /// automatically; direct callers must do it themselves. + /// /// # Arguments /// * `queue_lock_timeout` - Timeout for queue locks (normally held for ms) /// * `job_lock_timeout` - Timeout for job locks (held during job execution) diff --git a/src/worker.rs b/src/worker.rs index ce2b1a8..5ba1c42 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -667,6 +667,42 @@ impl WorkerRunner { self.config.stale_lock_cleanup_interval.is_some() ); + // Move permanently-failed jobs to the DLQ BEFORE startup cleanup runs. + // + // `startup_cleanup_with_timeouts()` calls `cleanup_permanently_failed_jobs()` + // which deletes rows from `_private_jobs` where `attempts >= max_attempts`. + // The DLQ processor uses the SAME predicate to find candidates to move into + // the DLQ — but it runs as a periodic background task that hasn't ticked yet + // at startup. 
Without this pre-move, any job that hit `max_attempts` while + // the worker was down (or in the gap between final failure and the next + // DLQ-processor tick on the previous run) would be silently deleted by + // cleanup before it reached the DLQ. + // + // We only run this when DLQ is enabled. If the user opted out of DLQ via + // `dlq_processor_interval = None`, they've explicitly chosen the + // delete-failed-jobs behaviour and we leave it intact. + if self.config.dlq_processor_interval.is_some() { + match self.client.process_failed_jobs().await { + Ok(moved) if moved > 0 => { + log::info!( + "Pre-cleanup DLQ move captured permanently-failed jobs (moved: {})", + moved + ); + } + Ok(_) => { + // No failed jobs waiting; nothing to capture. + } + Err(e) => { + // If this fails, the next call to startup_cleanup may delete the + // affected jobs. We log loudly so an operator can investigate. + log::error!( + "Pre-cleanup DLQ move failed; jobs may be lost when cleanup runs: {}", + e + ); + } + } + } + // Run startup cleanup with configured timeouts if let Err(e) = self .client diff --git a/tests/dlq_tests.rs b/tests/dlq_tests.rs index 2e3fd11..54f7f9e 100644 --- a/tests/dlq_tests.rs +++ b/tests/dlq_tests.rs @@ -1,8 +1,11 @@ //! Dead Letter Queue integration tests -use backfill::{BackfillClient, BackfillError, DlqFilter, JobSpec, Priority, Queue}; +use std::time::Duration; + +use backfill::{BackfillClient, BackfillError, DlqFilter, JobSpec, Priority, Queue, WorkerConfig, WorkerRunner}; use chrono::Utc; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use uuid::Uuid; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -700,3 +703,115 @@ async fn test_dlq_with_different_priorities() { assert!(priorities.contains(&-10)); // FAST_HIGH assert!(priorities.contains(&5)); // BULK_LOW } + +/// Regression test for the DLQ-vs-cleanup startup race (P0-1). +/// +/// Before the fix, `WorkerRunner::run_until_cancelled` called +/// `startup_cleanup_with_timeouts` first, which DELETEs rows from `_private_jobs` +/// where `attempts >= max_attempts`. The DLQ processor was supposed to capture +/// those same rows, but it ran as a periodic background task that hadn't ticked +/// yet at startup. Net effect: jobs that hit max_attempts while the worker was +/// down (or in the gap before the next DLQ tick) were silently deleted. +/// +/// The fix runs `process_failed_jobs()` synchronously before cleanup when DLQ +/// is enabled. This test stages a permanently-failed job, runs the actual +/// worker startup path (`run_until_cancelled` via `spawn_background`), and +/// asserts the job lands in DLQ instead of being deleted. +#[tokio::test] +async fn test_worker_startup_moves_failed_jobs_to_dlq_before_cleanup() { + let database_url = + std::env::var("DATABASE_URL").unwrap_or_else(|_| "postgresql://localhost/backfill_test".to_string()); + + let pool = sqlx::postgres::PgPoolOptions::new() + .max_connections(5) + .connect(&database_url) + .await + .expect("connect to test database"); + + let schema = format!("test_dlq_startup_capture_{}", Uuid::new_v4().simple()); + let client = BackfillClient::with_pool_and_schema(pool.clone(), schema.clone()) + .await + .expect("create client"); + client.init_dlq().await.expect("init DLQ"); + + // Stage a job that will look "permanently failed" to the DLQ scanner. 
+ let test_job = TestJob { + message: "must reach DLQ on worker startup".to_string(), + number: 4242, + }; + let outcome = client + .enqueue( + "test_job", + &test_job, + JobSpec { + max_attempts: Some(3), + ..Default::default() + }, + ) + .await + .expect("enqueue"); + let job = outcome.unwrap(); + let job_id = *job.id(); + + // Force `attempts = max_attempts` to simulate a job that exhausted retries + // before the previous worker shut down. + sqlx::query(&format!( + "UPDATE {}._private_jobs SET attempts = max_attempts WHERE id = $1", + schema + )) + .bind(job_id) + .execute(&pool) + .await + .expect("stage permanent failure"); + + // Sanity check: not in DLQ yet, still in main table. + let dlq_pre = client + .list_dlq_jobs(DlqFilter::default()) + .await + .expect("list DLQ pre"); + assert_eq!(dlq_pre.jobs.len(), 0, "DLQ should be empty before worker startup"); + + // Build and run a real WorkerRunner. The DLQ processor's tick interval is + // long enough that we know the only thing that can move the job to DLQ + // during this test window is the synchronous startup pre-move. + let config = WorkerConfig::new(database_url) + .with_schema(schema.clone()) + .with_poll_interval(Duration::from_millis(50)) + .with_dlq_processor_interval(Some(Duration::from_secs(60))); + let worker = WorkerRunner::builder(config) + .await + .expect("build runner") + .build() + .await + .expect("build worker"); + + let token = CancellationToken::new(); + let handle = worker.spawn_background(token.clone()); + + // Give startup enough time to run the pre-move + cleanup sequence. + // (No background DLQ tick will fire — we set the interval to 60s.) + tokio::time::sleep(Duration::from_millis(750)).await; + token.cancel(); + let _ = handle.await; + + // Verify: the job is in DLQ, not deleted. + let dlq_post = client + .list_dlq_jobs(DlqFilter::default()) + .await + .expect("list DLQ post"); + assert_eq!( + dlq_post.jobs.len(), + 1, + "permanently-failed job must reach DLQ during worker startup, not be deleted by cleanup" + ); + let dlq_job = &dlq_post.jobs[0]; + assert_eq!(dlq_job.original_job_id, Some(job_id)); + assert_eq!(dlq_job.task_identifier, "test_job"); + + // Cleanup + sqlx::query(&format!("DROP SCHEMA IF EXISTS {} CASCADE", schema)) + .execute(&pool) + .await + .ok(); + pool.close().await; +} From cc7f98168e284177e9ad2413c29c6afcd0fd6922 Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:08:15 -0700 Subject: [PATCH 03/19] fix(retries): wire WorkerError::is_retryable to actual retry behaviour MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `WorkerError::is_retryable()` and `classify_from_message()` advertised that non-retryable error variants (`ValidationFailed`, `Unauthorized`, `InvalidInput`, etc.) would skip retries — but no production code path ever consulted them. graphile_worker treats every `Err` identically and reschedules on its `exp(min(attempts, 10))` second formula until max_attempts is reached. A `ValidationFailed` from a malformed payload would chew through ~6 hours of wasted retries before reaching the DLQ. Adds `PermanentFailurePlugin`, auto-registered alongside `DlqCleanupPlugin` when the DLQ is enabled. On `JobFail` it classifies `ctx.error` via `WorkerError::classify_from_message`. If the result is non-retryable, it runs `UPDATE _private_jobs SET attempts = max_attempts WHERE id = $1 AND locked_by = $worker_id` so the next `get_job()` skips the row and the next DLQ-processor tick captures it. 
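For illustration, here is what the fix means from a task handler's point of view. A hedged sketch, not code from this diff: the handler shape and the `ValidationFailed` payload are assumed; the variant names and `is_retryable()` are the crate's real API.

```rust
use backfill::WorkerError;

// Hypothetical handler: a malformed payload is a permanent failure.
async fn run_report(payload: ReportPayload) -> Result<(), WorkerError> {
    if payload.report_type.is_empty() {
        // Non-retryable (is_retryable() == false): the JobFail hook sets
        // attempts = max_attempts and the next DLQ tick captures the job.
        return Err(WorkerError::ValidationFailed("empty report_type".into()));
    }
    // A retryable error (e.g. WorkerError::ConnectionTimeout) returned
    // here would keep the normal exponential-backoff retry schedule.
    Ok(())
}
```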
Implementation note: graphile_worker's own `permanently_fail_jobs` SQL function would be the natural fit, but its `WHERE locked_at IS NULL OR locked_at < NOW() - 4h` guard is a no-op inside a `JobFail` hook — the worker (us) still owns the lock at that point. The custom UPDATE filters by `locked_by = ctx.worker_id` instead, which is a tighter safety property (only the lock holder can override its own row). graphile_worker clears locked_at immediately after the hook returns; the `attempts = max_attempts` we wrote sticks because `fail_job` only mutates last_error, run_at, and the lock fields. Two regression tests in plugin_tests.rs: - `test_non_retryable_error_short_circuits_retries`: ValidationFailed reaches max_attempts after a single execution and lands in DLQ. - `test_retryable_error_does_not_short_circuit`: TemporaryUnavailable is left alone (attempts == 1 after one execution). The first test was verified to fail without the plugin. --- src/lib.rs | 2 + src/permanent_failure_plugin.rs | 129 +++++++++++++++++++++++++++++++ src/worker.rs | 13 +++- tests/plugin_tests.rs | 132 ++++++++++++++++++++++++++++++++ 4 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 src/permanent_failure_plugin.rs diff --git a/src/lib.rs b/src/lib.rs index e4aaf41..e082e53 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -113,6 +113,7 @@ use serde::Serialize; mod client; mod dlq_cleanup_plugin; mod errors; +mod permanent_failure_plugin; mod priorities; mod retries; mod worker; @@ -128,6 +129,7 @@ pub use client::cleanup::{ pub use client::*; pub use dlq_cleanup_plugin::DlqCleanupPlugin; pub use errors::{BackfillError, WorkerError}; +pub use permanent_failure_plugin::PermanentFailurePlugin; pub use priorities::*; pub use retries::*; pub use worker::*; diff --git a/src/permanent_failure_plugin.rs b/src/permanent_failure_plugin.rs new file mode 100644 index 0000000..5fd86cc --- /dev/null +++ b/src/permanent_failure_plugin.rs @@ -0,0 +1,129 @@ +//! Built-in plugin that short-circuits retries for non-retryable errors. +//! +//! `WorkerError` distinguishes retryable failures (`ConnectionTimeout`, +//! `RateLimitExceeded`, …) from non-retryable failures (`ValidationFailed`, +//! `Unauthorized`, …). Without this plugin, graphile_worker treats every +//! `Err` identically and keeps retrying until `max_attempts` is exhausted — +//! which can mean six hours of pointless retries for a malformed payload. +//! +//! With this plugin registered, when a handler returns an error that +//! `WorkerError::classify_from_message` flags as non-retryable, the job's +//! `attempts` is set to `max_attempts` so it lands in the DLQ on the next +//! processor tick instead of retrying. +//! +//! The plugin is auto-registered when the DLQ processor is enabled +//! (`WorkerConfig::dlq_processor_interval = Some(_)`). Without DLQ enabled +//! it would set `attempts = max_attempts` and the row would just sit in +//! `_private_jobs` forever — pointless and confusing — so the gating is +//! intentional. +//! +//! # Implementation note: why we can't use `permanently_fail_jobs` +//! +//! graphile_worker exposes a `permanently_fail_jobs(job_ids, reason)` SQL +//! function that does what we want, but it has a `WHERE locked_at IS NULL OR +//! locked_at < NOW() - 4h` guard to prevent racing with an active worker. +//! Inside the `JobFail` hook the worker (us) still owns the lock — +//! graphile_worker doesn't clear `locked_at` until `fail_job` runs *after* +//! our hook returns. So `permanently_fail_jobs` would silently no-op. +//! +//! 
We work around this with a direct UPDATE that filters by `locked_by = +//! ctx.worker_id` instead of the `locked_at IS NULL` guard. Same safety +//! property (only this worker can update its own row) but compatible with +//! the in-flight lock state. + +use graphile_worker::{HookRegistry, JobFail, JobFailContext, Plugin}; + +use crate::{BackfillClient, WorkerError}; + +/// Plugin that converts non-retryable handler errors into immediate DLQ +/// candidates by short-circuiting retry attempts. +/// +/// Auto-registered by `WorkerRunner` when DLQ is enabled. +#[derive(Clone)] +pub struct PermanentFailurePlugin { + client: BackfillClient, +} + +impl PermanentFailurePlugin { + /// Create a new permanent-failure plugin. + pub fn new(client: BackfillClient) -> Self { + Self { client } + } +} + +impl Plugin for PermanentFailurePlugin { + fn register(self, hooks: &mut HookRegistry) { + let client = self.client; + hooks.on(JobFail, move |ctx: JobFailContext| { + let client = client.clone(); + async move { + let classified = WorkerError::classify_from_message(ctx.error.clone()); + if classified.is_retryable() { + // Retryable error — let graphile_worker reschedule normally. + return; + } + + let job_id = *ctx.job.id(); + let task = ctx.job.task_identifier().to_string(); + + // Set attempts = max_attempts so the next `get_job()` doesn't + // pick this up (its predicate is `is_available`, which is a + // generated column = `locked_at IS NULL AND attempts < + // max_attempts`). The next DLQ processor tick will scan this + // row and move it to the DLQ. + // + // We filter by `locked_by = $3` to ensure we only mutate the + // row while we still own its lock. graphile_worker's + // `fail_job` will run immediately after this hook returns and + // will clear locked_at; the `attempts = max_attempts` we + // wrote here will persist (fail_job doesn't touch attempts). + let sql = format!( + "UPDATE {schema}._private_jobs \ + SET attempts = max_attempts, last_error = $2, updated_at = now() \ + WHERE id = $1 AND locked_by = $3", + schema = client.schema() + ); + let reason = format!("Permanent failure (non-retryable error): {}", ctx.error); + + match sqlx::query(&sql) + .bind(job_id) + .bind(&reason) + .bind(&ctx.worker_id) + .execute(client.pool()) + .await + { + Ok(result) if result.rows_affected() > 0 => { + log::info!( + "Short-circuited retries for non-retryable error (job_id: {}, task: {}, error: {})", + job_id, + task, + ctx.error + ); + } + Ok(_) => { + // No row updated — likely the lock changed hands or + // the row was deleted between fetch and hook. Rare + // but not catastrophic; fall back to normal retry. + log::warn!( + "Permanent-failure short-circuit affected zero rows (job_id: {}, task: {})", + job_id, + task + ); + } + Err(e) => { + // DB error — log and continue. graphile_worker's + // fail_job will run next and the job will retry once + // more (and likely fail again, hitting this hook on + // the next attempt). 
+ log::warn!( "Permanent-failure short-circuit failed; job will retry instead (job_id: {}, task: {}, error: {})", job_id, task, e ); } } } }); } } diff --git a/src/worker.rs b/src/worker.rs index 5ba1c42..ca5988a 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -518,11 +518,20 @@ impl WorkerRunner { worker_options = applier(worker_options); } - // If DLQ is enabled, add the cleanup plugin that removes DLQ entries - // when jobs with matching job_keys complete successfully + // If DLQ is enabled, register the built-in plugins that wire backfill's + // error semantics through to graphile_worker: + // + // - DlqCleanupPlugin: when a requeued DLQ job completes successfully, + // removes the matching DLQ entry so it isn't requeued again. + // - PermanentFailurePlugin: when a handler returns a non-retryable + // `WorkerError` variant (ValidationFailed, Unauthorized, etc.), + // short-circuits remaining retry attempts so the job lands in the + // DLQ on the next processor tick instead of waiting hours. if self.config.dlq_processor_interval.is_some() { let cleanup_plugin = crate::DlqCleanupPlugin::new(self.client.clone()); worker_options = worker_options.add_plugin(cleanup_plugin); + let permanent_failure_plugin = crate::PermanentFailurePlugin::new(self.client.clone()); + worker_options = worker_options.add_plugin(permanent_failure_plugin); } worker_options diff --git a/tests/plugin_tests.rs b/tests/plugin_tests.rs index fdd2355..2b3c2ec 100644 --- a/tests/plugin_tests.rs +++ b/tests/plugin_tests.rs @@ -447,6 +447,138 @@ async fn test_multiple_plugins_called_in_order() -> Result<()> { .await } +/// Regression test for P0-3: non-retryable `WorkerError` variants must +/// short-circuit retries instead of running all the way up to `max_attempts`. +/// +/// `WorkerError::is_retryable()` and `classify_from_message()` exist as the +/// public contract, but until this test landed nothing actually consulted them +/// at runtime. graphile_worker treats every `Err` identically and reschedules +/// for `e^min(attempts,10)` seconds regardless of error type. The fix is the +/// auto-registered `PermanentFailurePlugin` (gated on `dlq_processor_interval = +/// Some(_)`), which on a non-retryable error rewrites the row's `attempts` to +/// `max_attempts` so the next get_job() ignores it and the next DLQ tick +/// captures it. +/// +/// This test enqueues a job whose handler always returns +/// `WorkerError::ValidationFailed` (non-retryable), runs the worker once, and +/// asserts the row's attempts immediately reached `max_attempts` after a +/// single execution. +#[tokio::test] +async fn test_non_retryable_error_short_circuits_retries() -> Result<()> { + with_isolated_schema(|client| async move { + // Enqueue a job that will always return ValidationFailed (non-retryable + // because FailJob with should_retry=false hits the else-branch). + client + .enqueue( + "fail_job", + &FailJob { should_retry: false }, + JobSpec { + max_attempts: Some(5), + job_key: Some("non_retryable_short_circuit".to_string()), + ..Default::default() + }, + ) + .await?; + + let config = WorkerConfig::new(get_test_database_url()) + .with_schema(client.schema().to_string()) + .with_poll_interval(Duration::from_millis(50)); + // dlq_processor_interval defaults to Some(60s), which is what triggers + // PermanentFailurePlugin auto-registration. Don't override it. + + let worker = WorkerRunner::builder(config) + .await? + .define_job::<FailJob>() + .build() + .await?; + + // Run once.
The handler runs, returns ValidationFailed, JobFail hook + // fires, our plugin classifies the error as non-retryable, and rewrites + // attempts to max_attempts. + worker.process_available_jobs().await?; + + // Verify: attempts hit max_attempts after a single execution rather + // than incrementing by one per retry. + let row: (i16, i16) = sqlx::query_as(&format!( + "SELECT attempts, max_attempts FROM {}._private_jobs LIMIT 1", + client.schema() + )) + .fetch_one(client.pool()) + .await?; + let (attempts, max_attempts) = row; + assert_eq!( + attempts, max_attempts, + "Non-retryable error should short-circuit retries: attempts = max_attempts \ + (got attempts={}, max_attempts={})", + attempts, max_attempts + ); + + // The DLQ processor would normally pick this up on its next tick. We + // can call it directly to verify the end-to-end behaviour: the job + // ends up in the DLQ within a single retry-cycle's worth of time. + let moved = client.process_failed_jobs().await?; + assert_eq!(moved, 1, "permanent-failure short-circuit should make the job DLQ-eligible"); + + let dlq = client.list_dlq_jobs(DlqFilter::default()).await?; + assert_eq!(dlq.jobs.len(), 1); + assert_eq!(dlq.jobs[0].task_identifier, "fail_job"); + + Ok(()) + }) + .await +} + +#[tokio::test] +async fn test_retryable_error_does_not_short_circuit() -> Result<()> { + // Mirror of the previous test: confirms the plugin only fires for + // non-retryable errors, leaving the normal exponential-backoff path + // intact for everything else. + with_isolated_schema(|client| async move { + client + .enqueue( + "fail_job", + // should_retry=true with attempt<3 returns TemporaryUnavailable + // (retryable); on later attempts it would return ValidationFailed, + // but with max_attempts=2 we'll only see one execution. + &FailJob { should_retry: true }, + JobSpec { + max_attempts: Some(2), + job_key: Some("retryable_no_short_circuit".to_string()), + ..Default::default() + }, + ) + .await?; + + let config = WorkerConfig::new(get_test_database_url()) + .with_schema(client.schema().to_string()) + .with_poll_interval(Duration::from_millis(50)); + + let worker = WorkerRunner::builder(config) + .await? + .define_job::<FailJob>() + .build() + .await?; + + worker.process_available_jobs().await?; + + let (attempts, max_attempts): (i16, i16) = sqlx::query_as(&format!( + "SELECT attempts, max_attempts FROM {}._private_jobs LIMIT 1", + client.schema() + )) + .fetch_one(client.pool()) + .await?; + assert_eq!( + attempts, 1, + "Retryable error must NOT short-circuit; expected attempts=1 after one execution \ + (got attempts={}, max_attempts={})", + attempts, max_attempts + ); + + Ok(()) + }) + .await +} + #[tokio::test] async fn test_plugin_worker_lifecycle_hooks() -> Result<()> { with_isolated_schema(|client| async move { From de4ecd8e36debc10bafe0ea7cc882ff8c3cff640 Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:11:29 -0700 Subject: [PATCH 04/19] docs(retries): mark dead RetryPolicy timing fields as deprecated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `RetryPolicy.{initial_delay, max_delay, backoff_multiplier, jitter_factor}` were stored on the struct but never reached graphile_worker — only `max_attempts` is forwarded by `From<JobSpec> for GraphileJobSpec`. graphile_worker uses a hard-coded `exp(min(attempts, 10))` second SQL formula for every retry.
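The formula in question, quoted from the new module docs in src/retries.rs below:

```text
run_at = greatest(now(), run_at) + (exp(least(attempts, 10)) * interval '1 second')
```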
So `RetryPolicy::fast()` and `RetryPolicy::conservative()` produced identical retry timing in practice even though the docs promised "100ms-30s delays" vs. "1 min - 8 hour delays". This commit makes the fact match the promise: - Marks the unused math helpers (`RetryPolicy::new`, `with_jitter`, `calculate_delay`, `calculate_retry_time`, and `JobSpec::calculate_retry_time`) as `#[deprecated(since = "1.2.0")]` with notes pointing users at `RetryPolicy { max_attempts: n, ..Default::default() }` or the presets. - Rewrites the rustdoc on `RetryPolicy`, on each preset, on the `with_*_retries` builders, and on the `enqueue_*_with_retries` convenience helpers to describe what actually happens (only `max_attempts` differs across presets, fixed exp-backoff timing). - Updates the lib.rs module-level rustdoc. - Migrates `examples/enqueue_jobs.rs` to the recommended pattern so it doesn't trigger the new deprecation warnings. - Updates README.md to drop the false delay-range claims and replace the wrong "Pre-configured Fast/Bulk queues" / "Custom(name)" listing with the actual `Queue::Parallel` / `Queue::Serial(name)` enum. - Updates docs/02-dlq.md to mark the post-#9 "queue_name shows as default" warning as resolved (it was the change in v1.1.1 that fixed this). The struct fields themselves stay public for source-compatibility with existing struct-literal construction. Per-job backoff customization needs upstream graphile_worker support and is deferred. --- README.md | 80 ++++++++++++++++++++--------- docs/02-dlq.md | 21 +++++--- examples/enqueue_jobs.rs | 17 +++--- src/lib.rs | 70 +++++++++++++++++++------ src/retries.rs | 108 +++++++++++++++++++++++++++++++++------ 5 files changed, 224 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index 0f22321..4a46108 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # backfill [![CI](https://github.com/ceejbot/backfill/workflows/CI/badge.svg)](https://github.com/ceejbot/backfill/actions) -[![Coverage](https://img.shields.io/badge/coverage-64.67%25-yellow)](https://github.com/ceejbot/backfill/actions) [![Security](https://github.com/ceejbot/backfill/actions/workflows/security.yml/badge.svg)](https://github.com/ceejbot/backfill/actions/workflows/security.yml) A boringly-named priority queue system for doing async work. This library and work process wrap the [graphile_worker crate](https://lib.rs/crates/graphile_worker) to do things the way I want to do them. It's unlikely you'll want to do things exactly this way, but perhaps you can learn by reading the code, or get a jumpstart by borrowing open-source code, or heck, maybe this will do what you need. ## Overview This is a postgres-backed async work queue library that is a set of conveniences and features on top of the rust port of Graphile Worker. It gives you a library you can integrate with your own project to handle background tasks. -> **Status**: Core features are complete and tested (64.67%% test coverage, 55 tests). The library is suitable for production use for job enqueueing, worker processing, and DLQ management. The Admin API (feature-gated) is experimental. See [CHANGELOG.md](CHANGELOG.md) for details and [Known Limitations](docs/02-dlq.md#known-limitations). +> **Status**: Core features are complete and covered by an integration test +> suite. The library is suitable for production use for job enqueueing, +> worker processing, and DLQ management.
The Admin API (feature-gated) is +> experimental. See [CHANGELOG.md](CHANGELOG.md) for details and +> [Known Limitations](docs/02-dlq.md#known-limitations). ### What's New Over graphile_worker -Built on top of `graphile_worker` (v0.8.6), backfill adds these production-ready features: - -- 🎯 **Priority System** - Six-level priority queue (EMERGENCY to BULK_LOWEST) with numeric priority values -- 📦 **Named Queues** - Pre-configured Fast/Bulk queues plus custom queue support -- 🔄 **Smart Retry Policies** - Exponential backoff with jitter (fast/aggressive/conservative presets) -- 💀 **Dead Letter Queue (DLQ)** - Automatic failed job handling with query/requeue/deletion APIs -- 📊 **Comprehensive Metrics** - Prometheus-compatible metrics for jobs, DLQ, and database operations -- 🛠️ **High-Level Client API** - `BackfillClient` with ergonomic enqueueing helpers -- 🏃 **Flexible Worker Patterns** - `WorkerRunner` supporting tokio::select!, background tasks, and one-shot processing -- 🔧 **Admin API** - Optional Axum router for HTTP-based job management (experimental) -- 📝 **Convenience Functions** - `enqueue_fast()`, `enqueue_bulk()`, `enqueue_critical()`, etc. -- 🧹 **Stale Lock Cleanup** - Automatic cleanup of orphaned locks from crashed workers (startup + periodic) +Built on top of `graphile_worker` (v0.11.x), backfill adds these production-ready features: + +- 🎯 **Priority System** — Six-level priority enum (EMERGENCY=-20 down through + BULK_LOWEST=10), mapped through to graphile_worker's `priority asc` fetch + ordering. +- 📦 **Parallel + Serial queues** — `Queue::Parallel` (default, jobs run + concurrently across workers) or `Queue::Serial(name)` (one-job-at-a-time + per named queue, for rate limiting or per-entity ordering). +- 🔄 **Retry policy presets** — `fast` / `aggressive` / `conservative` + presets that differ in `max_attempts`. Note: backoff *timing* is fixed by + graphile_worker (see [`docs/02-dlq.md`](docs/02-dlq.md)) — only the + attempt count is configurable. +- 💀 **Dead Letter Queue (DLQ)** — Automatic failed-job handling with + query/requeue/deletion APIs. Includes a permanent-failure short-circuit + plugin: handlers that return non-retryable `WorkerError` variants land in + the DLQ on the first failure rather than waiting for `max_attempts` to + exhaust. +- 📊 **Comprehensive Metrics** — Prometheus-compatible metrics for jobs, + DLQ, and database operations. +- 🛠️ **High-Level Client API** — `BackfillClient` with ergonomic enqueueing + helpers. +- 🏃 **Flexible Worker Patterns** — `WorkerRunner` supporting + `tokio::select!`, background tasks, and one-shot processing. +- 🔧 **Admin API** — Optional Axum router for HTTP-based job management + (experimental). +- 📝 **Convenience Functions** — `enqueue_fast()`, `enqueue_bulk()`, + `enqueue_critical()`, etc. +- 🧹 **Stale Lock Cleanup** — Automatic cleanup of orphaned locks from + crashed workers (startup + periodic). Ordered correctly with the DLQ + scanner so failed jobs aren't lost across restarts. All built on graphile_worker's rock-solid foundation of PostgreSQL SKIP LOCKED and LISTEN/NOTIFY. 
### Features -- **Priority queues**: EMERGENCY, FAST_HIGH, FAST_DEFAULT, BULK_DEFAULT, BULK_LOW, BULK_LOWEST -- **Named queues**: Fast, Bulk, DeadLetter, Custom(name) -- **Scheduling**: Immediate or delayed execution with `run_at` -- **Idempotency**: Use `job_key` for deduplication -- **Exponential backoff**: Built-in retry policies with jitter to prevent thundering herds -- **Dead letter queue**: Handling jobs that experience un-retryable failures or exceed their retry limits -- **Error handling**: Automatic retry classification -- **Metrics**: Comprehensive metrics via the `metrics` crate - bring your own exporter (Prometheus, StatsD, etc.) -- **Monitoring**: Structured logging and tracing throughout -- **Building blocks for an axum admin api**: via a router you can mount on your own axum api server +- **Priority queues**: EMERGENCY (-20), FAST_HIGH (-10), FAST_DEFAULT (-5), + BULK_DEFAULT (0), BULK_LOW (5), BULK_LOWEST (10) — lower number = higher + priority. +- **Queue types**: `Queue::Parallel` (default), `Queue::Serial(name)` — plus + `Queue::serial_for(entity, id)` for per-entity ordering. +- **Scheduling**: Immediate or delayed execution with `run_at`. +- **Idempotency**: Use `job_key` for deduplication. +- **Retries**: Configurable `max_attempts` per job; graphile_worker handles + the exponential-backoff schedule (`exp(min(attempts, 10))` seconds, capped + at ~6h per retry). +- **Dead letter queue**: Automatic capture of jobs that exceed their retry + limits or return non-retryable errors. Includes a synchronous startup + pre-move so DLQ doesn't lose jobs across worker restarts. +- **Error classification**: `WorkerError` variants split into retryable and + non-retryable; non-retryable errors short-circuit retries to DLQ via an + auto-registered lifecycle plugin. +- **Metrics**: Comprehensive metrics via the `metrics` crate — bring your + own exporter (Prometheus, StatsD, etc.). +- **Monitoring**: Structured logging and tracing throughout. +- **Building blocks for an axum admin api**: via a router you can mount on + your own axum api server. Look at the `examples/` directory and the readme there for practical usage examples. diff --git a/docs/02-dlq.md b/docs/02-dlq.md index f4ee570..55f5511 100644 --- a/docs/02-dlq.md +++ b/docs/02-dlq.md @@ -880,16 +880,21 @@ The DLQ system is fully functional for production use, but has a few known limit ### 1. Queue Name Tracking -**Issue**: DLQ entries may show `queue_name` as `"default"` even if the job ran in a different queue (e.g., "fast" or "bulk"). +**Status**: Fixed in v1.1.1 (PR #9). -**Cause**: The GraphileWorker `Job` struct doesn't expose the queue name field, so when jobs are moved to the DLQ, the queue name defaults to `"default"`. +DLQ entries now correctly preserve the queue type of the original job: -**Workarounds**: -- The `task_identifier` field is always accurate and can be used for filtering -- Job priority is preserved, which often correlates with queue assignment -- For critical workflows, track queue assignment in your application logs or metrics +- Parallel jobs (`Queue::Parallel`) are stored with an empty `queue_name`. + When requeued, they go back to `Queue::Parallel` — concurrent execution + is preserved. +- Serial jobs (`Queue::Serial(name)`) are stored with their queue name. + When requeued, they go back to `Queue::Serial(name)` — single-job-at-a-time + semantics are preserved.
-**Future**: This will be resolved when GraphileWorker exposes queue_name on the Job struct, or when we implement direct database querying. +The DDL retains a `DEFAULT 'default'` clause for the `queue_name` column for +schema compatibility, but it is never used by the production code path — +`add_to_dlq` and `process_failed_jobs` always pass an explicit value +(possibly the empty string for parallel jobs). ### 2. Payload Visibility @@ -929,7 +934,7 @@ These limitations are minor and don't affect the core DLQ functionality: - ✅ **Error message capture** - Fully functional - ✅ **Requeuing workflows** - Production-ready - ✅ **Statistics and monitoring** - Complete -- ⚠️ **Queue name tracking** - Shows "default" for all queues +- ✅ **Queue name tracking** - Fixed in v1.1.1 (parallel ↔ serial round-trip preserved) - ⚠️ **Payload inspection** - Requires direct DB access - ⚠️ **Job cancellation** - Not yet implemented diff --git a/examples/enqueue_jobs.rs b/examples/enqueue_jobs.rs index 7bc310b..d6c9528 100644 --- a/examples/enqueue_jobs.rs +++ b/examples/enqueue_jobs.rs @@ -248,14 +248,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> { outcome.expect("outcome should contain a job").id() ); - // Enqueue a job with custom retry policy - let custom_retry_policy = RetryPolicy::new( - 6, // 6 attempts - std::time::Duration::from_millis(500), // Start with 500ms - std::time::Duration::from_secs(60), // Cap at 1 minute - 1.8, // 1.8x multiplier - ) - .with_jitter(0.2); // 20% jitter + // Enqueue a job with custom retry settings. + // Note: only `max_attempts` affects runtime behaviour. graphile_worker + // schedules retries via a fixed `exp(min(attempts, 10))` second formula, + // so the timing fields on RetryPolicy are not honored. See the docs on + // `RetryPolicy` for the full story. + let custom_retry_policy = RetryPolicy { + max_attempts: 6, + ..Default::default() + }; let custom_job = GenerateReportJob { report_type: "analytics_summary".to_string(), diff --git a/src/lib.rs b/src/lib.rs index e082e53..905f2b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,14 @@ //! - **Parallel execution** by default - jobs run concurrently across all //! workers //! - **Serial queues** when you need ordering or rate limiting -//! - **Exponential backoff** with jitter to prevent thundering herds -//! - **Flexible retry policies** (fast, aggressive, conservative, or custom) +//! - **Exponential backoff retries** via graphile_worker (timing is +//! `exp(min(attempts, 10))` seconds; fixed, not per-job tunable — see +//! [`RetryPolicy`]) +//! - **Configurable max-attempts per job** with `fast`, `aggressive`, and +//! `conservative` presets +//! - **Permanent-failure short-circuit** — non-retryable `WorkerError` +//! variants land in the DLQ on the first failure instead of waiting for +//! `max_attempts` to exhaust //! - **Dead letter queue** handling for failed jobs //! - **Type-safe job handlers** using Rust's type system //! - **Low-latency execution** via PostgreSQL LISTEN/NOTIFY @@ -362,14 +368,22 @@ impl Default for JobSpec { } impl JobSpec { - /// Create a JobSpec with exponential backoff retry policy + /// Attach a [`RetryPolicy`] to this JobSpec. + /// + /// Sets `max_attempts` from the policy. Note that backoff timing fields + /// on the policy are stored but not honored at runtime — see + /// [`RetryPolicy`].
pub fn with_retry_policy(mut self, retry_policy: RetryPolicy) -> Self { self.max_attempts = Some(retry_policy.max_attempts); self.retry_policy = Some(retry_policy); self } - /// Create a JobSpec optimized for fast retries + /// Configure for the `fast` preset: `max_attempts = 3`. + /// + /// In practice this differs from [`with_aggressive_retries`] and + /// [`with_conservative_retries`] only in the attempt count — see + /// [`RetryPolicy`]. pub fn with_fast_retries(mut self) -> Self { let policy = RetryPolicy::fast(); self.max_attempts = Some(policy.max_attempts); @@ -377,7 +391,10 @@ self } - /// Create a JobSpec optimized for aggressive retries + /// Configure for the `aggressive` preset: `max_attempts = 12`. + /// + /// At graphile_worker's fixed exp-backoff schedule, 12 attempts gives + /// roughly half a day of cumulative retry coverage before DLQ. pub fn with_aggressive_retries(mut self) -> Self { let policy = RetryPolicy::aggressive(); self.max_attempts = Some(policy.max_attempts); @@ -385,7 +402,7 @@ self } - /// Create a JobSpec optimized for conservative retries + /// Configure for the `conservative` preset: `max_attempts = 5`. pub fn with_conservative_retries(mut self) -> Self { let policy = RetryPolicy::conservative(); self.max_attempts = Some(policy.max_attempts); @@ -393,15 +410,28 @@ self } - /// Get the effective retry policy (returns default if none specified) + /// Get the effective retry policy (returns default if none specified). + /// + /// **Note:** Only `max_attempts` from the returned policy reaches + /// graphile_worker. See [`RetryPolicy`] for details. pub fn effective_retry_policy(&self) -> RetryPolicy { self.retry_policy.clone().unwrap_or_default() } - /// Calculate the next retry time for a failed job + /// Calculate what the next retry time *would* be under this spec's + /// policy. + /// + /// **Not used at runtime.** graphile_worker schedules retries via a + /// fixed SQL formula. This method is preserved as a utility but has no + /// effect on actual job behaviour. + #[deprecated( + since = "1.2.0", + note = "graphile_worker computes retry timing in SQL and ignores this method. Returns a value but has no runtime effect." + )] pub fn calculate_retry_time(&self, attempt: i32, failed_at: DateTime<Utc>) -> Option<DateTime<Utc>> { let policy = self.effective_retry_policy(); if policy.should_retry(attempt) { + #[allow(deprecated)] Some(policy.calculate_retry_time(attempt, failed_at)) } else { None // No more retries } } @@ -504,10 +534,12 @@ where client.enqueue(task_identifier, payload, spec).await } -/// Enqueue a high-priority job with fast exponential backoff retries. +/// Enqueue a high-priority job configured for a low retry count (3 attempts). /// -/// Best for high-priority jobs that need quick retries (3 attempts, 100ms-30s -/// delays). +/// Use for jobs where rapid failure-to-DLQ is preferred over many retries. +/// graphile_worker's retry timing is fixed at `exp(min(attempts, 10))` seconds +/// regardless of policy — see [`RetryPolicy`] — so the only difference between +/// this and other `_with_retries` helpers is the attempt cap. pub async fn enqueue_fast_with_retries( client: &BackfillClient, task_identifier: &str, @@ -527,9 +559,12 @@ where client.enqueue(task_identifier, payload, spec).await } -/// Enqueue a critical job with aggressive exponential backoff retries. +/// Enqueue a critical job with a high retry count (12 attempts). /// -/// Best for critical jobs that must succeed (12 attempts, up to 4 hour delays).
+/// Use for jobs that must eventually succeed if at all possible. graphile_worker +/// retries on a fixed `exp(min(attempts, 10))` second schedule, capping at +/// ~6h per retry — so 12 attempts gives roughly half a day of total retry +/// coverage. See [`RetryPolicy`] for the full timing. pub async fn enqueue_critical( client: &BackfillClient, task_identifier: &str, @@ -549,10 +584,13 @@ where client.enqueue(task_identifier, payload, spec).await } -/// Enqueue a bulk job with conservative exponential backoff retries. +/// Enqueue a bulk job with a moderate retry count (5 attempts via the +/// `conservative` preset). /// -/// Best for background jobs where consistency matters more than speed -/// (8 attempts, 1 min - 8 hour delays). +/// Use for background jobs that should be retried but where you don't want +/// many attempts. graphile_worker's retry timing is `exp(min(attempts, 10))` +/// seconds — see [`RetryPolicy`] — so this gives roughly 1s, 3s, 7s, 20s, +/// 55s before the job lands in DLQ. pub async fn enqueue_bulk_with_retries( client: &BackfillClient, task_identifier: &str, diff --git a/src/retries.rs b/src/retries.rs index d75f94d..609f4fa 100644 --- a/src/retries.rs +++ b/src/retries.rs @@ -1,19 +1,53 @@ -//! Retries: policy, implementation, and tests. +//! Retry policy. +//! +//! # ⚠️ Important: backoff timing is fixed, not configurable +//! +//! `RetryPolicy` exposes `initial_delay`, `max_delay`, `backoff_multiplier`, +//! and `jitter_factor` fields, but **graphile_worker does not honor them**. +//! graphile_worker schedules every retry on a hard-coded SQL formula: +//! +//! ```text +//! run_at = greatest(now(), run_at) + (exp(least(attempts, 10)) * interval '1 second') +//! ``` +//! +//! Concretely: ~1s after the 1st failure, ~2.7s after the 2nd, ~7.4s after +//! the 3rd, ~20s, ~55s, ~2.5min, ~6.7min, ~18min, ~49min, ~2.2h, ~6h (capped +//! at attempts ≥ 10). This formula applies to every job regardless of which +//! `RetryPolicy` the caller configured. +//! +//! **The only `RetryPolicy` field that actually affects job behaviour is +//! `max_attempts`**, which is forwarded to graphile_worker's `JobSpec`. +//! Methods that compute delays (`calculate_delay`, `calculate_retry_time`, +//! `with_jitter`) are deprecated — they were dead code masquerading as +//! configuration. The `fast()`, `aggressive()`, and `conservative()` presets +//! are kept for compatibility but in practice differ only in attempt count. +//! +//! Per-job backoff customisation requires upstream support in +//! `graphile_worker` and is not currently available. use std::time::Duration; -/// Configuration for exponential backoff retry policy. +/// Configuration for retry behaviour. +/// +/// **Only `max_attempts` affects runtime behaviour.** The other fields are +/// retained for source-compatibility but graphile_worker computes retry +/// timing from a fixed SQL formula (`exp(min(attempts, 10))` seconds) — see +/// the module-level docs for details. #[derive(Debug, Clone)] pub struct RetryPolicy { - /// Maximum number of retry attempts + /// Maximum number of retry attempts. **Honored at runtime.** pub max_attempts: i32, - /// Initial delay between retries + /// Initial delay between retries. **NOT honored** — graphile_worker uses + /// `exp(min(attempts, 10))` seconds regardless. pub initial_delay: Duration, - /// Maximum delay between retries + /// Maximum delay between retries. **NOT honored** — graphile_worker's + /// formula caps naturally at attempts ≥ 10 (~6 hours). 
pub max_delay: Duration, - /// Backoff multiplier (typically 2.0 for exponential backoff) + /// Backoff multiplier. **NOT honored** — graphile_worker's formula uses + /// `exp()`, not a configurable multiplier. pub backoff_multiplier: f64, - /// Add jitter to prevent thundering herd (0.0 to 1.0, default 0.1) + /// Jitter factor. **NOT honored** — graphile_worker's formula adds no + /// jitter. pub jitter_factor: f64, } @@ -30,7 +64,17 @@ impl Default for RetryPolicy { } impl RetryPolicy { - /// Create a new RetryPolicy with custom settings + /// Create a new RetryPolicy with custom settings. + /// + /// **Note:** Only `max_attempts` affects runtime behaviour. The other + /// arguments are stored on the returned policy but never reach + /// graphile_worker. See the module-level docs. + #[deprecated( + since = "1.2.0", + note = "graphile_worker honors only max_attempts; the timing arguments are dead config. \ Construct directly with RetryPolicy { max_attempts: n, ..Default::default() } \ or use a preset like RetryPolicy::fast()." + )] pub fn new(max_attempts: i32, initial_delay: Duration, max_delay: Duration, backoff_multiplier: f64) -> Self { Self { max_attempts, @@ -41,13 +85,26 @@ } } - /// Create a RetryPolicy with jitter configuration + /// Configure jitter on this policy. **Not applied at runtime.** + #[deprecated( + since = "1.2.0", + note = "jitter_factor is not honored by graphile_worker; this method has no runtime effect." + )] pub fn with_jitter(mut self, jitter_factor: f64) -> Self { self.jitter_factor = jitter_factor.clamp(0.0, 1.0); self } - /// Calculate the delay for a specific attempt number (0-based) + /// Calculate what the delay *would* be for a given attempt number under + /// this policy. + /// + /// **Not used at runtime.** graphile_worker schedules retries via a + /// fixed SQL formula and ignores this calculation. Retained as a + /// utility for tests and future use. + #[deprecated( + since = "1.2.0", + note = "graphile_worker computes retry timing in SQL and ignores this method. Returns a value but has no runtime effect." + )] pub fn calculate_delay(&self, attempt: i32) -> Duration { if attempt >= self.max_attempts { return Duration::ZERO; @@ -81,12 +138,20 @@ Duration::from_millis(final_delay_ms as u64) } - /// Calculate the run_at time for a retry + /// Calculate what the run_at time *would* be for a retry under this + /// policy. + /// + /// **Not used at runtime.** See [`RetryPolicy::calculate_delay`]. + #[deprecated( + since = "1.2.0", + note = "graphile_worker computes retry timing in SQL and ignores this method. Returns a value but has no runtime effect." + )] pub fn calculate_retry_time( &self, attempt: i32, base_time: chrono::DateTime<chrono::Utc>, ) -> chrono::DateTime<chrono::Utc> { + #[allow(deprecated)] let delay = self.calculate_delay(attempt); base_time + chrono::Duration::from_std(delay).unwrap_or(chrono::Duration::MAX) } @@ -101,7 +166,11 @@ self.max_attempts + 1 // +1 for the initial attempt } - /// Create a fast retry policy for high-priority jobs + /// Preset for high-priority jobs that need quick turnaround. + /// + /// `max_attempts = 3`. Other fields are stored but not honored — see the + /// module-level docs. In practice this preset differs from `aggressive()` + /// and `conservative()` only in attempt count.
     pub fn fast() -> Self {
         Self {
             max_attempts: 3,
@@ -112,23 +181,29 @@
         }
     }

-    /// Create an aggressive retry policy for critical jobs
+    /// Preset for critical jobs that should keep retrying for a long time.
+    ///
+    /// `max_attempts = 12` — graphile_worker's exponential backoff caps at
+    /// ~6h per retry once attempts ≥ 10, so this gives roughly 16 hours of
+    /// total retry coverage. Other fields are stored but not honored.
     pub fn aggressive() -> Self {
         Self {
             max_attempts: 12,
             initial_delay: Duration::from_millis(500),
-            max_delay: Duration::from_secs(600), // 10 minutes
+            max_delay: Duration::from_secs(600),
             backoff_multiplier: 1.5,
             jitter_factor: 0.15,
         }
     }

-    /// Create a conservative retry policy for bulk jobs
+    /// Preset for bulk jobs where consistency matters more than latency.
+    ///
+    /// `max_attempts = 5`. Other fields are stored but not honored.
     pub fn conservative() -> Self {
         Self {
             max_attempts: 5,
             initial_delay: Duration::from_secs(5),
-            max_delay: Duration::from_secs(1800), // 30 minutes
+            max_delay: Duration::from_secs(1800),
             backoff_multiplier: 2.5,
             jitter_factor: 0.2,
         }
@@ -136,6 +211,7 @@ impl RetryPolicy {
 }

 #[cfg(test)]
+#[allow(deprecated)] // tests still exercise the deprecated math helpers
 mod tests {
     use super::*;
     use crate::JobSpec;

From 4492b356023ae793d78e8f4f9db57723ad250ef9 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:13:52 -0700
Subject: [PATCH 05/19] fix(dlq): list_dlq_jobs.total must respect filters

The list query was filtering DLQ rows correctly via QueryBuilder, but the
companion COUNT query was an unconditional `SELECT COUNT(*) FROM
backfill_dlq` with no WHERE clause. Net effect: any paginated admin UI
that filtered by task / queue / time range computed wrong page counts
because the returned `total` reflected the entire table, not the filtered
subset.

Apply the same WHERE clauses to the count query. Adds a regression test
that stages 5 jobs for task_a and 3 for task_b, then asserts that
filtering by task returns the right per-task total (not the unfiltered 8).
Verified to fail on the previous code.
---
 src/client/dlq.rs  | 22 ++++++++++--
 tests/dlq_tests.rs | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index 2bf2688..7c13b01 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -248,9 +248,25 @@ impl BackfillClient {
             crate::metrics::record_dlq_age(&job.task_identifier, age_seconds);
         }

-        // Get total count for pagination (simplified - could be optimized)
-        let count_query = format!("SELECT COUNT(*) FROM {}.backfill_dlq", self.schema);
-        let total: i64 = sqlx::query_scalar(&count_query).fetch_one(&self.pool).await?;
+        // Get total count for pagination, applying the same filters as the list
+        // query. Without this, paging UIs that filter (e.g., by task) compute
+        // wrong page counts because `total` would reflect every DLQ row, not
+        // the filtered subset.
+ let mut count_builder = sqlx::QueryBuilder::new("SELECT COUNT(*) FROM "); + count_builder.push(&self.schema).push(".backfill_dlq WHERE 1=1"); + if let Some(task) = &filter.task_identifier { + count_builder.push(" AND task_identifier = ").push_bind(task); + } + if let Some(queue) = &filter.queue_name { + count_builder.push(" AND queue_name = ").push_bind(queue); + } + if let Some(from) = filter.failed_after { + count_builder.push(" AND failed_at >= ").push_bind(from); + } + if let Some(to) = filter.failed_before { + count_builder.push(" AND failed_at <= ").push_bind(to); + } + let total: i64 = count_builder.build_query_scalar().fetch_one(&self.pool).await?; Ok(DlqJobList { jobs, diff --git a/tests/dlq_tests.rs b/tests/dlq_tests.rs index 54f7f9e..2227d45 100644 --- a/tests/dlq_tests.rs +++ b/tests/dlq_tests.rs @@ -704,6 +704,91 @@ async fn test_dlq_with_different_priorities() { assert!(priorities.contains(&5)); // BULK_LOW } +/// Regression test for P1-1: `list_dlq_jobs.total` must reflect the filtered +/// row count, not the unfiltered table size. +/// +/// Before the fix, the count query was an unconditional `SELECT COUNT(*) FROM +/// backfill_dlq` so any paginated admin UI that filtered (by task/queue/time) +/// computed wrong page counts. The list query honored filters; only the total +/// didn't. +#[tokio::test] +async fn test_list_dlq_jobs_total_respects_filters() { + let client = setup_test_client("dlq_total_filtered").await; + client.init_dlq().await.expect("DLQ init"); + + let test_job = TestJob { + message: "filter total test".to_string(), + number: 1, + }; + + // Stage 5 jobs with task "task_a" and 3 with task "task_b" — 8 total. + for _ in 0..5 { + let outcome = client + .enqueue("task_a", &test_job, JobSpec::default()) + .await + .expect("enqueue task_a"); + client + .add_to_dlq(&outcome.unwrap(), "task_a failure", None) + .await + .expect("add task_a to DLQ"); + } + for _ in 0..3 { + let outcome = client + .enqueue("task_b", &test_job, JobSpec::default()) + .await + .expect("enqueue task_b"); + client + .add_to_dlq(&outcome.unwrap(), "task_b failure", None) + .await + .expect("add task_b to DLQ"); + } + + // Unfiltered: total = 8. + let all = client + .list_dlq_jobs(DlqFilter::default()) + .await + .expect("list all"); + assert_eq!(all.total, 8); + + // Filter by task_a — must report 5, not 8. + let only_a = client + .list_dlq_jobs(DlqFilter { + task_identifier: Some("task_a".to_string()), + ..Default::default() + }) + .await + .expect("list task_a"); + assert_eq!( + only_a.total, 5, + "filtered total must reflect the filter, not the whole table" + ); + assert_eq!(only_a.jobs.len(), 5); + + // Filter by task_b — must report 3, not 8. + let only_b = client + .list_dlq_jobs(DlqFilter { + task_identifier: Some("task_b".to_string()), + ..Default::default() + }) + .await + .expect("list task_b"); + assert_eq!(only_b.total, 3); + assert_eq!(only_b.jobs.len(), 3); + + // Combine filter + pagination: total reflects filter, jobs reflect page. + let page_a = client + .list_dlq_jobs(DlqFilter { + task_identifier: Some("task_a".to_string()), + limit: Some(2), + offset: Some(0), + ..Default::default() + }) + .await + .expect("list task_a paginated"); + assert_eq!(page_a.total, 5, "total must ignore LIMIT/OFFSET"); + assert_eq!(page_a.jobs.len(), 2); +} + /// Regression test for the DLQ-vs-cleanup startup race (P0-1). 
 ///
 /// Before the fix, `WorkerRunner::run_until_cancelled` called

From 9561429e4554111e2a42107d11f5a338c21411b7 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:15:42 -0700
Subject: [PATCH 06/19] fix(worker)!: process_available_jobs returns () instead of fake usize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`process_available_jobs` was declared `Result<usize, BackfillError>` but
unconditionally returned `Ok(0)`. The doc comment admitted this. The
existing test even asserted `assert_eq!(processed, 0)` with a comment
noting that "job counting isn't implemented yet" — i.e., the test was
asserting the bug.

Change the signature to `Result<(), BackfillError>`. Truthful, smaller,
forces the matching `?`-discarding pattern that nearly every existing
caller already uses. Update the doc to point users at lifecycle hook
plugins (`JobComplete` / `JobFail`) for actual job-count instrumentation.

BREAKING CHANGE: callers that captured the return value as a usize must
remove the binding. None of the existing tests or examples relied on the
value beyond asserting it was zero (the bug).
---
 src/worker.rs         | 31 ++++++++++++------------------
 tests/worker_tests.rs |  9 +++------
 2 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/src/worker.rs b/src/worker.rs
index ca5988a..deb0142 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -828,33 +828,29 @@ impl WorkerRunner {
         tokio::spawn(async move { runner.run_until_cancelled(cancellation_token).await })
     }

-    /// Process all currently available jobs and return
+    /// Process all currently available jobs and return.
     ///
     /// This method is designed for batch processing or testing scenarios where
     /// you want to process the current job queue without running a persistent
     /// worker.
     ///
-    /// This method processes all jobs that are currently available (where
-    /// `run_at <= now()`), respecting the configured concurrency limit.
-    /// Jobs are processed in priority order (lower priority number = higher
-    /// priority), then by `run_at` timestamp.
+    /// Processes all jobs where `run_at <= now()`, respecting the configured
+    /// concurrency limit. Jobs are processed in priority order (lower number =
+    /// higher priority), then by `run_at`.
     ///
-    /// The method returns when:
-    /// - All available jobs have been processed
-    /// - No more jobs are available to process
+    /// Returns when all available jobs have been processed.
     ///
-    /// Note: This method currently returns 0 as an accurate job count would
-    /// require additional instrumentation. The jobs are still processed
-    /// correctly.
+    /// # Counting jobs
     ///
-    /// # Returns
-    ///
-    /// Returns `Ok(0)` on success (job count tracking not yet implemented).
+    /// If you need to know how many jobs ran, register a `JobComplete` /
+    /// `JobFail` plugin via `add_plugin()` before building the worker — that's
+    /// the supported path for runtime job-count instrumentation. This method
+    /// no longer pretends to count for you.
     ///
     /// # Errors
     ///
     /// Returns an error if worker initialization or job processing fails.
-    pub async fn process_available_jobs(&self) -> Result<usize, BackfillError> {
+    pub async fn process_available_jobs(&self) -> Result<(), BackfillError> {
         log::info!("Processing available jobs (one-shot mode)");

         // Create worker instance
@@ -868,10 +864,7 @@ impl WorkerRunner {
             .map_err(|e| BackfillError::WorkerRuntime(e.to_string()))?;

         log::info!("Finished processing available jobs");
-
-        // Note: Returning 0 for now as accurate counting would require additional
-        // instrumentation. Consider using metrics or hooks to track job counts.
-        Ok(0)
+        Ok(())
     }

     /// Get access to the underlying BackfillClient for job enqueueing and
diff --git a/tests/worker_tests.rs b/tests/worker_tests.rs
index a79cf23..f8c2ccf 100644
--- a/tests/worker_tests.rs
+++ b/tests/worker_tests.rs
@@ -320,12 +320,9 @@ async fn test_worker_runner_process_available_jobs() -> Result<(), BackfillError
     enqueue_fast(client, SimpleTestJob::IDENTIFIER, &job2, None).await?;
     enqueue_fast(client, SimpleTestJob::IDENTIFIER, &job3, None).await?;

-    // Process all available jobs
-    let processed = worker.process_available_jobs().await?;
-
-    // Note: Returns 0 because job counting isn't implemented yet
-    // But the jobs should still be processed successfully
-    assert_eq!(processed, 0);
+    // Process all available jobs. Returns () — for job counts, plug in a
+    // JobComplete hook before building the worker.
+    worker.process_available_jobs().await?;

     // Verify jobs were actually processed by checking the database
     // Jobs should be completed and removed from the queue

From 00a6c14878cbaa25f38a90ab6d0d293c272b10c2 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:18:58 -0700
Subject: [PATCH 07/19] fix(dlq): atomic DELETE+UPSERT in process_failed_jobs; honest requeue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two atomicity holes in the DLQ paths:

P1-2 (process_failed_jobs): The reaper did UPSERT into backfill_dlq, then
DELETE from _private_jobs as two separate statements. A crash between them
(or a connection blip during the DELETE) would leave the job in BOTH
tables. The code even acknowledged this with "Consider this a partial
failure - job is in DLQ but also still in main table."

Fixed by collapsing both into a single statement using a writable CTE:

    WITH deleted AS (
        DELETE FROM _private_jobs WHERE id = $1 RETURNING 1
    )
    INSERT INTO backfill_dlq (...)
    SELECT $1::bigint, $2::text, ... FROM deleted
    ON CONFLICT (job_key) WHERE job_key IS NOT NULL DO UPDATE SET ...

If DELETE finds no row (another worker beat us to it), `deleted` is empty,
the SELECT yields zero rows, and the INSERT runs zero times — natural
no-op. If DELETE finds the row, the INSERT either creates a new DLQ row or
ON-CONFLICT-updates an existing one (the requeue-then-fail-again case).
The whole thing commits atomically. `rows_affected() > 0` distinguishes
"actually moved" from "raced and skipped" for the moved-count metric.

Also adds explicit type casts on the SELECT placeholders since they no
longer have the INSERT VALUES position to disambiguate types from.

P1-3 (requeue_dlq_job): Operations are enqueue (via
WorkerUtils::add_raw_job, which hard-codes the pool) followed by UPDATE
backfill_dlq. Can't be wrapped in a single transaction without forking
add_raw_job. Smaller honest fix: demote the post-enqueue UPDATE error from
`?`-propagation to a WARN log and document the residual non-atomicity in
the function docstring.

Rationale: when the enqueue succeeds, the user's stated intent — "put this
job back in the queue" — has been achieved.
Failing the operation because the bookkeeping UPDATE blipped would mislead
the user into thinking the requeue didn't happen, and possibly trigger a
double-requeue (which then errors with AlreadyInProgress). Better to
return Ok with a log than to return Err and confuse the caller.

No new tests: the atomicity guarantees are not directly observable without
a chaos hook. The existing DLQ tests continue to verify happy-path
correctness.
---
 src/client/dlq.rs | 97 +++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index 7c13b01..55571e9 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -322,6 +322,18 @@ impl BackfillClient {
     }

     /// Requeue a job from the DLQ back to the main queue.
+    ///
+    /// Two SQL operations happen back-to-back:
+    /// 1. Enqueue the job (via `WorkerUtils::add_raw_job`).
+    /// 2. UPDATE the DLQ row's `requeued_count`, `last_requeued_at`, `notes`.
+    ///
+    /// `WorkerUtils::add_raw_job` hard-codes the pool, so we can't bundle
+    /// these into a single transaction. To prefer truthful return values,
+    /// step 2 failures are logged at WARN level rather than failing the
+    /// operation: a successful enqueue followed by a failed bookkeeping
+    /// UPDATE returns `Ok(job)` (the job IS in the queue) with a stale
+    /// `requeued_count` in the DLQ admin view. The next refresh of the row
+    /// will show the discrepancy if it matters.
     pub async fn requeue_dlq_job(&self, dlq_id: i64, notes: Option<String>) -> Result<Job, BackfillError> {
         // Get the DLQ job
         let dlq_job = self
@@ -384,11 +396,24 @@ impl BackfillClient {
             self.schema
         );

-        sqlx::query(&update_query)
+        // Bookkeeping update — see function docstring. If this fails, the
+        // job has already been re-enqueued; we return success but log so an
+        // operator can see that the DLQ row's counters didn't advance.
+        if let Err(e) = sqlx::query(&update_query)
             .bind(&notes)
             .bind(dlq_id)
             .execute(&self.pool)
-            .await?;
+            .await
+        {
+            log::warn!(
+                "DLQ row bookkeeping update failed after successful requeue \
+                 (dlq_id: {}, job_id: {}, error: {}); requeued_count/last_requeued_at \
+                 may be stale",
+                dlq_id,
+                job.id(),
+                e
+            );
+        }

         Ok(*job)
     }
@@ -652,14 +677,26 @@ impl BackfillClient {
         // Convert last_error from TEXT to JSONB for DLQ table
         let last_error_json = last_error.map(serde_json::Value::String);

-        // Move to DLQ using UPSERT to handle requeued jobs that fail again
-        let upsert_dlq_query = format!(
+        // Atomic DELETE + UPSERT in a single statement using a writable
+        // CTE. Either the row is gone from `_private_jobs` AND in
+        // `backfill_dlq`, or neither happened — no partial-failure window
+        // where the job ends up in both tables. The INSERT's SELECT
+        // sources from `deleted`, so when DELETE finds nothing
+        // (e.g., another worker already moved the row) the INSERT runs
+        // zero times.
+ let move_query = format!( r#" + WITH deleted AS ( + DELETE FROM {schema}._private_jobs WHERE id = $1 RETURNING 1 + ) INSERT INTO {schema}.backfill_dlq ( original_job_id, task_identifier, payload, queue_name, priority, job_key, max_attempts, failure_reason, failure_count, last_error, original_created_at, original_run_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12) + ) + SELECT $1::bigint, $2::text, $3::jsonb, $4::text, $5::int, $6::text, + $7::int, $8::text, $9::int, $10::jsonb, $11::timestamptz, $12::timestamptz + FROM deleted ON CONFLICT (job_key) WHERE job_key IS NOT NULL DO UPDATE SET failed_at = NOW(), failure_count = {schema}.backfill_dlq.failure_count + EXCLUDED.failure_count, @@ -672,12 +709,12 @@ impl BackfillClient { let failure_reason = format!("Job exceeded maximum retry attempts ({}/{})", attempts, max_attempts); - let upsert_result = sqlx::query(&upsert_dlq_query) + let move_result = sqlx::query(&move_query) .bind(job_id) .bind(&task_identifier) .bind(&payload) .bind(&queue_name) - .bind(priority) + .bind(priority as i32) .bind(&job_key) .bind(max_attempts as i32) .bind(failure_reason) @@ -688,36 +725,30 @@ impl BackfillClient { .execute(&self.pool) .await; - match upsert_result { + match move_result { + Ok(result) if result.rows_affected() > 0 => { + moved_count += 1; + log::info!( + "Successfully moved failed job to DLQ (job_id: {}, task: {}, attempts: {}/{})", + job_id, + task_identifier, + attempts, + max_attempts + ); + } Ok(_) => { - // Successfully moved to DLQ, now remove from main jobs table - let delete_query = format!("DELETE FROM {}._private_jobs WHERE id = $1", self.schema); - match sqlx::query(&delete_query).bind(job_id).execute(&self.pool).await { - Ok(_) => { - moved_count += 1; - log::info!( - "Successfully moved failed job to DLQ (job_id: {}, task: {}, attempts: {}/{})", - job_id, - task_identifier, - attempts, - max_attempts - ); - } - Err(e) => { - log::error!( - "Failed to delete job from main table after DLQ insertion (job_id: {}, task: {}, error: {})", - job_id, - task_identifier, - e - ); - // Consider this a partial failure - job is in DLQ - // but also still in main table - } - } + // DELETE found no row — likely another worker beat us to + // it. Not a problem; the job either already reached DLQ + // via the other worker or is still progressing normally. + log::debug!( + "DLQ move skipped (job_id: {}, task: {}); row no longer present in _private_jobs", + job_id, + task_identifier + ); } Err(e) => { log::error!( - "Failed to insert job into DLQ (job_id: {}, task: {}, error: {})", + "Failed to move job to DLQ (job_id: {}, task: {}, error: {})", job_id, task_identifier, e From bcbcaa4e46843508a55554257280dee76c26731c Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:21:00 -0700 Subject: [PATCH 08/19] fix(metrics): bound queue label cardinality in built-in metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The library was using `Queue::as_str()` (which returns the actual queue name for serial queues) as the `queue=` Prometheus label across `backfill_jobs_enqueued`, `backfill_jobs_already_in_progress`, `backfill_dlq_jobs_added`, and `backfill_dlq_jobs_requeued`. The library's own docs recommend `Queue::serial_for("user", id)` for per-entity ordering — which produces a different queue name per user, which produces a different label value per user, which produces unbounded Prometheus time series. 
Industry best practice for cardinality-sensitive systems (Prometheus, Datadog, etc.) is to keep label cardinality below ~100; per- user labels easily exceed this by orders of magnitude and can OOM the metrics backend or render dashboards unusable. DLQ-side metrics had a related bug: parallel-origin jobs are stored with queue_name = "" (empty string), which is invalid as a Prometheus label in some backends. Adds `Queue::metric_label() -> &'static str` returning "parallel" or "serial" — bounded, two-valued. Built-in metric emission now uses this. On the DLQ side where we have a stored queue_name string (not a Queue enum), `metrics::queue_metric_label_from_name(&str)` does the same two-valued mapping. `Queue::as_str()` is preserved for logging (where unbounded values are fine) and gets a doc comment warning against using it for metrics. If users want per-queue stats for a known small set of named queues, they can emit them themselves via a plugin — that's the correct place to make the cardinality-vs-detail tradeoff per their own setup. Adds a unit test asserting that `serial_for("user", 12345)` and `serial_for("user", 99999)` produce the same label. --- src/client/dlq.rs | 11 +++++++++-- src/client/enqueue.rs | 10 ++++++---- src/lib.rs | 33 +++++++++++++++++++++++++++++++-- src/metrics.rs | 9 +++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/src/client/dlq.rs b/src/client/dlq.rs index 55571e9..e32335f 100644 --- a/src/client/dlq.rs +++ b/src/client/dlq.rs @@ -375,7 +375,8 @@ impl BackfillClient { }; // Record metrics - crate::metrics::record_dlq_job_requeued(&dlq_job.task_identifier, spec.queue.as_str()); + // Use bounded metric_label() to keep label cardinality small. + crate::metrics::record_dlq_job_requeued(&dlq_job.task_identifier, spec.queue.metric_label()); log::info!( "Job requeued from DLQ (dlq_id: {}, job_id: {}, task: {})", @@ -615,7 +616,13 @@ impl BackfillClient { // Record metrics crate::metrics::record_db_operation("dlq_add", "success"); crate::metrics::record_db_operation_duration("dlq_add", start.elapsed().as_secs_f64()); - crate::metrics::record_dlq_job_added(&dlq_job.queue_name, &dlq_job.task_identifier, &dlq_job.failure_reason); + // Bounded label: "parallel" / "serial" rather than the raw queue name + // (which may be "" for parallel-origin jobs and unbounded for serial). + crate::metrics::record_dlq_job_added( + crate::metrics::queue_metric_label_from_name(&dlq_job.queue_name), + &dlq_job.task_identifier, + &dlq_job.failure_reason, + ); log::info!( "Job moved to DLQ (dlq_id: {}, task: {}, failure_reason: {})", diff --git a/src/client/enqueue.rs b/src/client/enqueue.rs index 85a5864..5f74ec3 100644 --- a/src/client/enqueue.rs +++ b/src/client/enqueue.rs @@ -101,8 +101,10 @@ impl BackfillClient { crate::metrics::record_db_operation("enqueue", "success"); crate::metrics::record_db_operation_duration("enqueue", start.elapsed().as_secs_f64()); - // Record job enqueued metric - crate::metrics::record_job_enqueued(spec.queue.as_str(), task_identifier, spec.priority.0); + // Record job enqueued metric. Use the bounded `metric_label()` + // (either "parallel" or "serial") to avoid exploding the label + // cardinality when callers use `Queue::serial_for(entity, id)`. 
+ crate::metrics::record_job_enqueued(spec.queue.metric_label(), task_identifier, spec.priority.0); log::debug!( "Job enqueued (job_id: {}, task: {}, queue: {}, priority: {})", @@ -122,8 +124,8 @@ impl BackfillClient { crate::metrics::record_db_operation("enqueue", "already_in_progress"); crate::metrics::record_db_operation_duration("enqueue", start.elapsed().as_secs_f64()); - // Record the already_in_progress metric - crate::metrics::record_job_already_in_progress(spec.queue.as_str(), task_identifier); + // Record the already_in_progress metric (bounded label) + crate::metrics::record_job_already_in_progress(spec.queue.metric_label(), task_identifier); let key = job_key.clone().unwrap_or_else(|| "".to_string()); log::debug!( diff --git a/src/lib.rs b/src/lib.rs index 905f2b0..6f0cf1f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -235,10 +235,13 @@ impl Queue { } } - /// Returns a string representation for logging/metrics. + /// Returns a string representation for logging. /// /// Returns "parallel" for parallel queues, or the queue name for serial - /// queues. + /// queues. **Do not use this for metric labels** — serial queue names can + /// have unbounded cardinality (e.g., `Queue::serial_for("user", id)` + /// produces a different name per user, which would explode Prometheus + /// time-series storage). Use [`Queue::metric_label`] for metrics. pub fn as_str(&self) -> &str { match self { Queue::Parallel => "parallel", @@ -246,6 +249,20 @@ impl Queue { } } + /// Bounded label suitable for metrics: returns either `"parallel"` or + /// `"serial"`. + /// + /// This drops the queue name. The library uses this for all built-in + /// metric emission (`backfill_jobs_enqueued`, `backfill_dlq_*`, etc.) to + /// keep label cardinality bounded. If you need per-queue metrics for a + /// fixed, small set of named queues, emit them yourself via a plugin. + pub fn metric_label(&self) -> &'static str { + match self { + Queue::Parallel => "parallel", + Queue::Serial(_) => "serial", + } + } + /// Returns true if this is a parallel (non-serialized) queue. pub fn is_parallel(&self) -> bool { matches!(self, Queue::Parallel) @@ -680,6 +697,18 @@ mod tests { assert_eq!(Queue::Parallel.as_str(), "parallel"); } + #[test] + fn queue_metric_label_is_bounded() { + // The whole point of metric_label vs as_str: per-entity serial queues + // (e.g., "user:123") collapse to "serial" instead of carrying the + // unbounded entity id into Prometheus labels. + assert_eq!(Queue::Parallel.metric_label(), "parallel"); + assert_eq!(Queue::serial("anything").metric_label(), "serial"); + assert_eq!(Queue::serial_for("user", 12345).metric_label(), "serial"); + assert_eq!(Queue::serial_for("user", 99999).metric_label(), "serial"); + assert_eq!(Queue::dead_letter().metric_label(), "serial"); + } + #[test] fn queue_serial_creation() { let queue = Queue::serial("test"); diff --git a/src/metrics.rs b/src/metrics.rs index 694cd9d..54a60c2 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -89,6 +89,15 @@ pub(crate) fn record_job_already_in_progress(queue: &str, task: &str) { .increment(1); } +/// Convert a stored DLQ `queue_name` (which may be "" for parallel-origin +/// jobs) into a bounded metric label of either `"parallel"` or `"serial"`. +/// +/// Mirrors [`crate::Queue::metric_label`] but operates on the post-storage +/// string form used by `DlqJob.queue_name`. 
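+/// For example: `""` maps to `"parallel"`, `"user:123"` maps to `"serial"`.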
+pub(crate) fn queue_metric_label_from_name(queue_name: &str) -> &'static str {
+    if queue_name.is_empty() { "parallel" } else { "serial" }
+}
+
 /// Record a job being added to DLQ
 pub(crate) fn record_dlq_job_added(queue: &str, task: &str, reason: &str) {
     metrics::counter!(

From 513fbbc044156481bdceae6de69e70a9ee6eff9c Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:24:39 -0700
Subject: [PATCH 09/19] fix(worker): make QueueConfig multi-queue API honest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The QueueConfig multi-queue API has been misleading since it was
introduced:

- WorkerConfig::with_queues(Vec<QueueConfig>) accepted multiple configs,
  but only the FIRST one's `concurrency` was ever used at runtime
  (worker.rs WorkerOptionsBuilder::new). The rest were silently dropped
  with a single WARN log.
- QueueConfig::priority_queue accepted a (min, max) priority_range that
  was never read by any code path.
- QueueConfig::named_queue accepted a queue name that was logged but never
  applied to graphile_worker's WorkerOptions (which doesn't expose
  per-worker queue filtering).
- worker_count() returned `queue_configs.len()`, lying when callers passed
  N configs and exactly 1 worker actually ran.

This commit makes the surface match reality without removing it (which
would be a bigger break):

- Adds `WorkerConfig::with_concurrency(usize)` — the clean direct
  replacement.
- Marks `WorkerConfig::with_queues`, `QueueConfig::named_queue`, and
  `QueueConfig::priority_queue` as `#[deprecated(since = "1.2.0")]` with
  notes pointing at the working alternatives. The `name` and
  `priority_range` fields get doc comments saying they aren't honored.
- Fixes `worker_count()` to return 1 (the truth) instead of
  `queue_configs.len()`.
- Renames the existing test `test_worker_runner_with_multiple_queues` to
  `test_worker_runner_with_multiple_queues_only_first_honored` and pins
  the actual behaviour (`worker_count() == 1`). The original test's
  `assert_eq!(worker.worker_count(), 3)` was asserting the bug.
- Migrates `examples/basic_worker.rs` from the deprecated multi-queue
  shape to `with_concurrency`. The example previously claimed to configure
  separate fast/bulk/dead_letter workers but only the first was ever
  spawned — now it spawns one worker sized for the largest configured
  concurrency, with a comment explaining the constraint.

Users who actually want multiple specialized workers can spawn multiple
WorkerRunner instances themselves. Per-job queue routing still works
correctly via Queue::serial(name) at enqueue time — that's unchanged.
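
For anyone migrating, the replacement pattern is to size each runner
explicitly and spawn as many runners as you need. A sketch (handler
registration elided; `db_url` and the `"backfill"` schema are placeholder
values, not part of this change):

    use tokio_util::sync::CancellationToken;

    let cancel = CancellationToken::new();

    // One runner sized for latency-sensitive work...
    let fast = WorkerRunner::builder(
        WorkerConfig::new(db_url.clone())
            .with_schema("backfill")
            .with_concurrency(10),
    )
    .build()
    .await?;

    // ...and a second, independently sized runner. Both poll the same
    // job table; graphile_worker offers no per-worker queue filter, so
    // they simply share whatever work is runnable.
    let bulk = WorkerRunner::builder(
        WorkerConfig::new(db_url)
            .with_schema("backfill")
            .with_concurrency(2),
    )
    .build()
    .await?;

    let fast_task = tokio::spawn({
        let token = cancel.clone();
        async move { fast.run_until_cancelled(token).await }
    });
    let bulk_task = tokio::spawn({
        let token = cancel.clone();
        async move { bulk.run_until_cancelled(token).await }
    });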
---
 examples/basic_worker.rs | 31 ++++++++++---------
 src/worker.rs            | 78 ++++++++++++++++++++++++++++++++++------
 tests/worker_tests.rs    | 25 ++++++++-----
 3 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/examples/basic_worker.rs b/examples/basic_worker.rs
index eea89d4..846563d 100644
--- a/examples/basic_worker.rs
+++ b/examples/basic_worker.rs
@@ -16,7 +16,7 @@ use std::num::ParseIntError;
 use std::time::Duration;

 use backfill::{
-    BackfillError, IntoTaskHandlerResult, QueueConfig, TaskHandler, WorkerConfig, WorkerContext, WorkerError,
+    BackfillError, IntoTaskHandlerResult, TaskHandler, WorkerConfig, WorkerContext, WorkerError,
     WorkerRunner,
 };
 use log::{error, info, warn};
@@ -107,19 +107,22 @@ impl ExampleWorkerConfig {

 impl From<ExampleWorkerConfig> for WorkerConfig {
     fn from(value: ExampleWorkerConfig) -> Self {
-        WorkerConfig {
-            database_url: value.database_url,
-            schema: value.schema,
-            queue_configs: vec![
-                QueueConfig::named_queue("fast", value.fast_concurrency),
-                QueueConfig::named_queue("bulk", value.bulk_concurrency),
-                QueueConfig::named_queue("dead_letter", value.dlq_concurrency),
-                QueueConfig::default_queue(5), // Default queue with moderate concurrency
-            ],
-            poll_interval: value.poll_interval,
-            dlq_processor_interval: Some(value.dlq_processor_interval),
-            ..Default::default()
-        }
+        // A single WorkerRunner spawns one Worker; pick the highest of the
+        // configured per-queue concurrencies so this worker can keep up with
+        // any of them. To run truly separate workers per queue, spawn
+        // multiple WorkerRunner instances yourself — graphile_worker doesn't
+        // expose per-worker queue filtering.
+        let concurrency = value
+            .fast_concurrency
+            .max(value.bulk_concurrency)
+            .max(value.dlq_concurrency)
+            .max(5);
+
+        WorkerConfig::new(value.database_url)
+            .with_schema(value.schema)
+            .with_concurrency(concurrency)
+            .with_poll_interval(value.poll_interval)
+            .with_dlq_processor_interval(Some(value.dlq_processor_interval))
     }
 }

diff --git a/src/worker.rs b/src/worker.rs
index deb0142..881b00a 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -101,19 +101,31 @@ use tokio_util::sync::CancellationToken;

 use crate::{BackfillClient, BackfillError, Plugin, TaskHandler, WorkerOptions}; // Re-export for use in wrapper

-/// Configuration for a worker queue
+/// Configuration for a worker queue.
+///
+/// **Currently only `concurrency` is honored.** `name` and `priority_range`
+/// were intended for multi-queue worker setups, but graphile_worker's
+/// `WorkerOptions` doesn't expose per-worker queue filtering, so a single
+/// `WorkerRunner` runs exactly one worker and consumes only the first
+/// `QueueConfig` in `WorkerConfig::queue_configs`. To run multiple workers
+/// with different queue specializations, spawn multiple `WorkerRunner`s.
 #[derive(Debug, Clone)]
 pub struct QueueConfig {
-    /// Queue name (None for default queue)
+    /// Queue name (None for default queue). **Not honored** — the worker
+    /// processes jobs from any queue regardless of this field.
     pub name: Option<String>,
-    /// Number of concurrent jobs to process in this queue
+    /// Number of concurrent jobs to process. **Honored.**
     pub concurrency: usize,
-    /// Priority range for jobs in this queue (inclusive)
+    /// Priority range for jobs in this queue. **Not honored** — the worker
+    /// fetches by priority order with no range filter.
     pub priority_range: Option<(i32, i32)>,
 }

 impl QueueConfig {
-    /// Create configuration for the default queue
+    /// Create configuration for the default queue with the given concurrency.
+    ///
+    /// This is the only `QueueConfig` constructor where every field is
+    /// actually honored at runtime.
     pub fn default_queue(concurrency: usize) -> Self {
         Self {
             name: None,
@@ -122,7 +134,17 @@ impl QueueConfig {
         }
     }

-    /// Create configuration for a named queue
+    /// Create configuration for a named queue.
+    ///
+    /// **Deprecated**: the `name` field is not honored at worker startup —
+    /// graphile_worker's `WorkerOptions` doesn't filter jobs by queue. Use
+    /// [`WorkerConfig::with_concurrency`] instead, and route jobs to
+    /// specific named queues at enqueue time via `Queue::serial(name)`.
+    #[deprecated(
+        since = "1.2.0",
+        note = "queue name is not honored by graphile_worker's WorkerOptions; use WorkerConfig::with_concurrency \
+                and route jobs to named queues at enqueue time via Queue::serial(name)"
+    )]
     pub fn named_queue(name: impl Into<String>, concurrency: usize) -> Self {
         Self {
             name: Some(name.into()),
@@ -131,7 +153,16 @@ impl QueueConfig {
         }
     }

-    /// Create configuration for a priority-based queue
+    /// Create configuration for a priority-based queue.
+    ///
+    /// **Deprecated**: neither `name` nor `priority_range` is honored at
+    /// runtime. The worker fetches jobs by priority order (lower number
+    /// first) regardless of any range configured here. Use
+    /// [`WorkerConfig::with_concurrency`] instead.
+    #[deprecated(
+        since = "1.2.0",
+        note = "priority_range is never honored by the worker fetch loop; this constructor stores values that are dead. Use WorkerConfig::with_concurrency."
+    )]
     pub fn priority_queue(name: impl Into<String>, concurrency: usize, min_priority: i32, max_priority: i32) -> Self {
         Self {
             name: Some(name.into()),
@@ -203,7 +234,28 @@ impl WorkerConfig {
         self
     }

-    /// Set queue configurations
+    /// Set the worker's concurrency.
+    ///
+    /// `WorkerRunner` runs a single graphile_worker `Worker` whose
+    /// `concurrency` setting determines how many jobs it can execute in
+    /// parallel. To run multiple specialized workers, spawn multiple
+    /// `WorkerRunner` instances yourself.
+    pub fn with_concurrency(mut self, concurrency: usize) -> Self {
+        self.queue_configs = vec![QueueConfig::default_queue(concurrency)];
+        self
+    }
+
+    /// Set queue configurations.
+    ///
+    /// **Deprecated**: only the first `QueueConfig`'s `concurrency` is
+    /// honored — graphile_worker's `WorkerOptions` doesn't expose
+    /// per-worker queue filtering, so subsequent configs are ignored. Use
+    /// [`Self::with_concurrency`] instead. Route jobs to named queues at
+    /// enqueue time via `Queue::serial(name)`.
+    #[deprecated(
+        since = "1.2.0",
+        note = "only the first QueueConfig's concurrency is honored; use with_concurrency() and route via Queue::serial(name) at enqueue time"
+    )]
     pub fn with_queues(mut self, queues: Vec<QueueConfig>) -> Self {
         self.queue_configs = queues;
         self
@@ -873,9 +925,14 @@ impl WorkerRunner {
         &self.client
     }

-    /// Get the number of worker instances configured
+    /// Get the number of worker instances actually running.
+    ///
+    /// Always returns 1: `WorkerRunner` spawns exactly one graphile_worker
+    /// `Worker`. The historical `Vec<QueueConfig>` API allowed callers to
+    /// pass multiple configs, but only the first was ever used at runtime —
+    /// this method now reports the truth instead of `queue_configs.len()`.
     pub fn worker_count(&self) -> usize {
-        self.config.queue_configs.len()
+        1
     }

     /// Check if DLQ processor is enabled
@@ -889,6 +946,7 @@ mod tests {
     use super::*;

     #[test]
+    #[allow(deprecated)] // pins the deprecated constructors' storage shape
     fn test_queue_config_builders() {
         let default = QueueConfig::default_queue(5);
         assert_eq!(default.name, None);
diff --git a/tests/worker_tests.rs b/tests/worker_tests.rs
index f8c2ccf..43875ae 100644
--- a/tests/worker_tests.rs
+++ b/tests/worker_tests.rs
@@ -45,16 +45,14 @@ async fn test_worker_config_builder() {
         .with_schema("custom_schema")
         .with_poll_interval(Duration::from_millis(100))
         .with_dlq_processor_interval(Some(Duration::from_secs(30)))
-        .with_queues(vec![
-            QueueConfig::default_queue(5),
-            QueueConfig::named_queue("fast", 10),
-        ]);
+        .with_concurrency(10);

     assert_eq!(config.database_url, "postgresql://localhost/test");
     assert_eq!(config.schema, "custom_schema");
     assert_eq!(config.poll_interval, Duration::from_millis(100));
     assert_eq!(config.dlq_processor_interval, Some(Duration::from_secs(30)));
-    assert_eq!(config.queue_configs.len(), 2);
+    assert_eq!(config.queue_configs.len(), 1);
+    assert_eq!(config.queue_configs[0].concurrency, 10);
 }

 #[tokio::test]
@@ -74,6 +72,7 @@ async fn test_queue_config_default_queue() {
 }

 #[tokio::test]
+#[allow(deprecated)] // documents the deprecated constructor's storage shape
 async fn test_queue_config_named_queue() {
     let config = QueueConfig::named_queue("bulk", 20);

@@ -83,6 +82,7 @@ async fn test_queue_config_named_queue() {
 }

 #[tokio::test]
+#[allow(deprecated)] // documents the deprecated constructor's storage shape
 async fn test_queue_config_priority_queue() {
     let config = QueueConfig::priority_queue("urgent", 5, -100, 100);

@@ -151,7 +151,12 @@ async fn test_worker_runner_multiple_job_types() -> Result<(), BackfillError> {
 }

 #[tokio::test]
-async fn test_worker_runner_with_multiple_queues() -> Result<(), BackfillError> {
+#[allow(deprecated)] // covers the deprecated multi-queue API while it still exists
+async fn test_worker_runner_with_multiple_queues_only_first_honored() -> Result<(), BackfillError> {
+    // The Vec<QueueConfig> API is deprecated because graphile_worker doesn't
+    // expose per-worker queue filtering; only one Worker is ever spawned and
+    // only the first config's concurrency is used. This test pins that
+    // behaviour: passing 3 configs results in worker_count == 1.
     let config = WorkerConfig::new(get_test_database_url())
         .with_schema("test_worker_queues")
         .with_queues(vec![
@@ -167,7 +172,11 @@ async fn test_worker_runner_with_multiple_queues() -> Result<(), BackfillError>
         .build()
         .await?;

-    assert_eq!(worker.worker_count(), 3);
+    assert_eq!(
+        worker.worker_count(),
+        1,
+        "WorkerRunner only ever spawns one Worker, regardless of queue_configs.len()"
+    );

     Ok(())
 }
@@ -358,7 +367,7 @@ async fn test_worker_runner_invalid_database_url() {
 async fn test_worker_options_builder_concurrency() -> Result<(), BackfillError> {
     let config = WorkerConfig::new(get_test_database_url())
         .with_schema("test_concurrency")
-        .with_queues(vec![QueueConfig::default_queue(15)])
+        .with_concurrency(15)
         .with_dlq_processor_interval(None);

     let worker = WorkerRunner::builder(config)

From 302b142c59425bed35664abfdca4490e06c8c217 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:29:03 -0700
Subject: [PATCH 10/19] chore: small client cleanups (P2-9, P2-12, P2-13)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three minor improvements bundled together since each is a few lines:

P2-9 (lib.rs enqueue_emergency): drop the redundant
`run_at = Some(Utc::now())`. graphile_worker resolves a NULL run_at to
NOW() at the SQL layer, so setting it explicitly was just an extra
Rust-side clock read with no behavioural difference. Removing it also
avoids a tiny clock-skew window where the Rust-computed timestamp could
end up a hair in the past.

P2-12 (client/enqueue.rs enqueue): the duration histogram was emitted on
the success and already-in-progress branches but not the plain-error
branch, biasing `backfill_db_operation_duration_seconds` toward fast
successes. Move the duration recording to fire once after the call
completes so all three outcomes contribute to the histogram.

P2-13 (client/dlq.rs delete_dlq_job): replace the "fetch first, then
delete" pattern with a single `DELETE … RETURNING task_identifier`. Halves
DB round-trips on a hot admin path and removes the small race where the
row could vanish between the SELECT and DELETE.
---
 src/client/dlq.rs     | 28 ++++++++++++++--------------
 src/client/enqueue.rs |  7 ++++---
 src/lib.rs            |  4 +++-
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index e32335f..4185d60 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -421,24 +421,24 @@ impl BackfillClient {

     /// Delete a job from the DLQ permanently.
     pub async fn delete_dlq_job(&self, dlq_id: i64) -> Result<bool, BackfillError> {
-        // Get the job first to record task identifier in metrics
-        let task_identifier = if let Some(job) = self.get_dlq_job(dlq_id).await? {
-            Some(job.task_identifier.clone())
-        } else {
-            None
-        };
-
-        let query = format!("DELETE FROM {}.backfill_dlq WHERE id = $1", self.schema);
-        let result = sqlx::query(&query).bind(dlq_id).execute(&self.pool).await?;
-
-        let deleted = result.rows_affected() > 0;
+        // Single round-trip: DELETE … RETURNING gives us the task_identifier
+        // for the metric in the same query. Returns None if the row didn't
+        // exist (no rows deleted).
+        let query = format!(
+            "DELETE FROM {}.backfill_dlq WHERE id = $1 RETURNING task_identifier",
+            self.schema
+        );
+        let task_identifier: Option<String> = sqlx::query_scalar(&query)
+            .bind(dlq_id)
+            .fetch_optional(&self.pool)
+            .await?;

-        if deleted && let Some(task) = task_identifier {
-            crate::metrics::record_dlq_job_deleted(&task);
+        if let Some(task) = &task_identifier {
+            crate::metrics::record_dlq_job_deleted(task);
             log::info!("Job deleted from DLQ (dlq_id: {}, task: {})", dlq_id, task);
         }

-        Ok(deleted)
+        Ok(task_identifier.is_some())
     }

     /// Delete DLQ entries by job_key.
diff --git a/src/client/enqueue.rs b/src/client/enqueue.rs
index 5f74ec3..dc83801 100644
--- a/src/client/enqueue.rs
+++ b/src/client/enqueue.rs
@@ -95,11 +95,13 @@ impl BackfillClient {
             .add_raw_job(task_identifier, serde_json::to_value(payload)?, spec.clone().into())
             .await;

+        // Record duration once for every outcome — success, already-in-progress,
+        // and error — so the histogram isn't biased toward fast successes.
+        crate::metrics::record_db_operation_duration("enqueue", start.elapsed().as_secs_f64());
+
         match result {
             Ok(job) => {
-                // Record successful enqueue
                 crate::metrics::record_db_operation("enqueue", "success");
-                crate::metrics::record_db_operation_duration("enqueue", start.elapsed().as_secs_f64());

                 // Record job enqueued metric. Use the bounded `metric_label()`
                 // (either "parallel" or "serial") to avoid exploding the label
@@ -122,7 +124,6 @@ impl BackfillClient {
             if is_row_not_found(&e) && job_key.is_some() {
                 // Job with this key is already locked/in progress
                 crate::metrics::record_db_operation("enqueue", "already_in_progress");
-                crate::metrics::record_db_operation_duration("enqueue", start.elapsed().as_secs_f64());

                 // Record the already_in_progress metric (bounded label)
                 crate::metrics::record_job_already_in_progress(spec.queue.metric_label(), task_identifier);
diff --git a/src/lib.rs b/src/lib.rs
index 6f0cf1f..7eaa18b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -541,9 +541,11 @@ pub async fn enqueue_emergency(
 where
     T: Serialize,
 {
+    // run_at defaults to None, which graphile_worker resolves to NOW() at the
+    // SQL layer — equivalent to "execute immediately" without the extra Rust-
+    // side clock read.
     let spec = JobSpec {
         priority: Priority::EMERGENCY,
-        run_at: Some(Utc::now()), // Execute immediately
         job_key,
         ..Default::default()
     };

From c4c34b57c5d31989f43274019ea3cb8609989498 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:29:39 -0700
Subject: [PATCH 11/19] chore(client): remove orphan queries.rs documentation stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

src/client/queries.rs was never `mod`-declared in src/client/mod.rs (or
anywhere else), so the Rust compiler ignored it entirely. Its
rustdoc-as-documentation pattern can't even render via `cargo doc` because
it's not part of the module tree. The defined `JobInfo` / `DlqJobInfo`
structs were never instantiated.

Its purpose was to demonstrate the sqlx::query!() macro pattern for users
wanting compile-time-checked queries — but `docs/01-database-setup.md`
already covers SQLx setup (with `cargo sqlx prepare` for offline mode), so
the duplicated guidance has no unique value.

Delete the file. If we want to ship a real query!() recipe, we'll do it in
proper user-facing docs and back it with actual code that exercises the
macros.
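
(For the record: wiring the file into the build would have required a
one-line module declaration that never existed. Hypothetically, in
src/client/mod.rs:

    mod queries; // was never present, so rustc never compiled queries.rs

Without that line the file is invisible to the compiler, cargo doc, and
the test suite alike.)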
---
 src/client/queries.rs | 118 ------------------------------------------
 1 file changed, 118 deletions(-)
 delete mode 100644 src/client/queries.rs

diff --git a/src/client/queries.rs b/src/client/queries.rs
deleted file mode 100644
index 8af0299..0000000
--- a/src/client/queries.rs
+++ /dev/null
@@ -1,118 +0,0 @@
-//! Compile-time checked SQLx queries
-//!
-//! This module contains examples of how to use SQLx macros for compile-time
-//! verification of database queries. These provide type safety and guarantee
-//! that your queries will work at runtime.
-//!
-//! ## Setup Requirements
-//!
-//! To use compile-time query verification, you need either:
-//!
-//! ### Option 1: Online Mode (Recommended)
-//! Set the `DATABASE_URL` environment variable:
-//! ```bash
-//! export DATABASE_URL="postgresql://user:password@localhost:5432/database"
-//! ```
-//!
-//! ### Option 2: Offline Mode
-//! Generate query metadata using:
-//! ```bash
-//! cargo sqlx prepare
-//! ```
-//! This creates `.sqlx/sqlx-data.json` with query metadata.
-//!
-//! ## Usage Example
-//!
-//! ```rust,no_run
-//! use sqlx::PgPool;
-//! use crate::BackfillError;
-//!
-//! /// Example of compile-time checked query
-//! pub async fn get_job_count_safe(pool: &PgPool) -> Result<i64, BackfillError> {
-//!     let result = sqlx::query!(
-//!         "SELECT COUNT(*) as count FROM graphile_worker.jobs"
-//!     )
-//!     .fetch_one(pool)
-//!     .await?;
-//!
-//!     Ok(result.count.unwrap_or(0))
-//! }
-//!
-//! /// Example with parameters
-//! pub async fn get_jobs_by_task_safe(
-//!     pool: &PgPool,
-//!     task_name: &str
-//! ) -> Result<Vec<JobInfo>, BackfillError> {
-//!     let rows = sqlx::query!(
-//!         r#"
-//!         SELECT id, task_identifier, queue_name, priority, created_at
-//!         FROM graphile_worker.jobs
-//!         WHERE task_identifier = $1
-//!         ORDER BY created_at DESC
-//!         "#,
-//!         task_name
-//!     )
-//!     .fetch_all(pool)
-//!     .await?;
-//!
-//!     let jobs = rows.into_iter().map(|row| JobInfo {
-//!         id: row.id,
-//!         task_identifier: row.task_identifier,
-//!         queue_name: row.queue_name,
-//!         priority: row.priority,
-//!         created_at: row.created_at,
-//!     }).collect();
-//!
-//!     Ok(jobs)
-//! }
-//! ```
-//!
-//! ## Benefits
-//!
-//! - **Type Safety**: Column names and types are verified at compile time
-//! - **Runtime Safety**: Guaranteed that queries will execute successfully
-//! - **Refactoring Safety**: Schema changes cause compile errors, not runtime errors
-//! - **Performance**: No runtime query parsing overhead
-//!
-//! ## Migration Strategy
-//!
-//! For existing projects, migrate gradually:
-//! 1. Start with high-risk queries (production critical paths)
-//! 2. Use `sqlx::query!()` for SELECT queries first
-//! 3. Move to `sqlx::query_as!()` for complex mappings
-//! 4. Convert INSERT/UPDATE/DELETE queries last
-//!
-//! ## CI/CD Integration
-//!
-//! Add to your CI pipeline:
-//! ```yaml
-//! # GitHub Actions example
-//! - name: Check SQLx queries
-//!   run: cargo sqlx prepare --check
-//!   env:
-//!     DATABASE_URL: ${{ secrets.DATABASE_URL }}
-//! ```
-
-use chrono::{DateTime, Utc};
-
-/// Example job information structure
-#[derive(Debug)]
-pub struct JobInfo {
-    pub id: i64,
-    pub task_identifier: String,
-    pub queue_name: Option<String>,
-    pub priority: i32,
-    pub created_at: DateTime<Utc>,
-}
-
-/// Example DLQ information structure for compile-time queries
-#[derive(Debug)]
-pub struct DlqJobInfo {
-    pub id: i64,
-    pub task_identifier: String,
-    pub failed_at: DateTime<Utc>,
-    pub failure_count: i32,
-}
-
-// Note: Actual query implementations would go here if DATABASE_URL is available
-// For now, this serves as documentation and examples for users to implement

From d55094f51f7a9d2388762103f786fac3b36fcdb5 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:30:25 -0700
Subject: [PATCH 12/19] perf(dlq): use NOT EXISTS instead of NOT IN in process_failed_jobs

The "find failed jobs not yet in DLQ" predicate used
`id NOT IN (SELECT COALESCE(original_job_id, -1) FROM backfill_dlq)`. The
COALESCE was a correct workaround for NOT IN's NULL-poisons-everything
semantics, but:

- NOT EXISTS doesn't have the NULL trap (correlated check, not a list
  membership test), so the COALESCE workaround disappears.
- NOT EXISTS can be planned as an anti-join in PostgreSQL; NOT IN against
  a subquery typically can't (especially when the inner column is
  nullable, even with COALESCE), and degrades into per-row
  hashing/lookups on large DLQs.

Behaviour is identical for the current data (any DLQ row with NULL
original_job_id was created via add_to_dlq for a job we don't have an id
for, which can never match a `_private_jobs.id` anyway). At scale this
should plan and run noticeably faster.
---
 src/client/dlq.rs | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index 4185d60..94d21d5 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -642,23 +642,33 @@ impl BackfillClient {
     /// Returns the number of jobs moved to the DLQ.
     pub async fn process_failed_jobs(&self) -> Result<usize, BackfillError> {
         // Find jobs that have failed permanently (attempts >= max_attempts)
-        // and haven't been processed yet
+        // and aren't already in the DLQ.
+        //
+        // NOT EXISTS is preferred over NOT IN here: it scales better at large
+        // DLQ sizes (PostgreSQL can hash-anti-join), and it handles NULL
+        // values in `backfill_dlq.original_job_id` cleanly without the
+        // COALESCE workaround that NOT IN requires (NULL in a NOT IN list
+        // makes the whole predicate UNKNOWN, which silently filters
+        // everything out).
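+        // For example, `1 NOT IN (2, NULL)` evaluates to UNKNOWN rather than
+        // true, so without the COALESCE a single NULL original_job_id would
+        // have hidden every failed job from this scan.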
         let find_failed_jobs_query = format!(
             r#"
             SELECT jobs.id, tasks.identifier AS task_identifier, job_queues.queue_name,
                    jobs.priority, jobs.key as job_key, jobs.max_attempts, jobs.attempts,
                    jobs.last_error, jobs.created_at, jobs.run_at, jobs.updated_at, jobs.payload
-            FROM {}._private_jobs AS jobs
-            INNER JOIN {}._private_tasks AS tasks ON tasks.id = jobs.task_id
-            LEFT JOIN {}._private_job_queues AS job_queues ON job_queues.id = jobs.job_queue_id
+            FROM {schema}._private_jobs AS jobs
+            INNER JOIN {schema}._private_tasks AS tasks ON tasks.id = jobs.task_id
+            LEFT JOIN {schema}._private_job_queues AS job_queues ON job_queues.id = jobs.job_queue_id
             WHERE jobs.attempts >= jobs.max_attempts
             AND jobs.max_attempts > 0
-            AND jobs.id NOT IN (SELECT COALESCE(original_job_id, -1) FROM {}.backfill_dlq)
+            AND NOT EXISTS (
+                SELECT 1 FROM {schema}.backfill_dlq dlq
+                WHERE dlq.original_job_id = jobs.id
+            )
             ORDER BY jobs.updated_at ASC
             LIMIT 100
             "#,
-            self.schema, self.schema, self.schema, self.schema
+            schema = self.schema
        );

        let failed_jobs = sqlx::query(&find_failed_jobs_query).fetch_all(&self.pool).await?;

From 2ba74d43fe6edb83fa77b26dc1a81d7efbfffbb7 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:31:50 -0700
Subject: [PATCH 13/19] fix(dlq): exponential backoff in DLQ processor on consecutive errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The DLQ processor's background task ticked at a fixed interval regardless
of error history. If process_failed_jobs() persistently errored (DB blip,
auth issue, schema drift) the loop would log an ERROR every `interval`
seconds — spammy and contributing nothing useful. The original code
acknowledged this gap with a TODO-shaped comment.

Replace the tokio::interval-based loop with a manual sleep loop that
applies exponential backoff on consecutive errors:

- 2x, 4x, 8x, 16x, 32x the base interval — capped at 32x (so the default
  60s interval caps at ~32min between attempts under sustained failure).
- Any success resets the counter back to a 1x interval.

Behaviour on the happy path is unchanged: first scan still runs
immediately on spawn, subsequent scans run every `interval`. Cancellation
remains responsive (the `biased` select! makes the cancel branch win ties
so a slow-firing cancel doesn't get queued behind a long sleep).
---
 src/client/dlq.rs | 67 +++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index 94d21d5..0ea0fa2 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -783,14 +783,17 @@ impl BackfillClient {

     /// Process failed jobs continuously in a background task.
     ///
-    /// This spawns a background task that periodically scans for failed jobs
-    /// and moves them to the DLQ. The task runs until the provided cancellation
-    /// token is triggered.
+    /// Spawns a background task that periodically scans for failed jobs and
+    /// moves them to the DLQ. Runs until the cancellation token is triggered.
+    ///
+    /// On consecutive errors the wait between scans grows exponentially —
+    /// `2*interval`, `4*interval`, `8*interval`, … capped at `32*interval` —
+    /// so a transient outage doesn't produce log spam at fixed cadence.
+    /// On any successful scan the wait resets to `interval`.
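+    /// (With the default 60s interval: 2min, 4min, 8min, 16min, 32min, then
+    /// 32min thereafter.)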
/// /// # Arguments - /// * `interval` - How often to scan for failed jobs - /// * `cancellation_token` - Token to signal when to stop the background - /// task + /// * `interval` - How often to scan for failed jobs (steady state) + /// * `cancellation_token` - Signals when to stop the background task /// /// # Returns /// A JoinHandle for the background task @@ -799,36 +802,56 @@ impl BackfillClient { interval: std::time::Duration, cancellation_token: tokio_util::sync::CancellationToken, ) -> tokio::task::JoinHandle<()> { + // 2^5 = 32x interval cap. With the default 60s interval that's ~32min + // between attempts under sustained failure, which is far enough apart + // to be quiet but close enough to still recover quickly when the + // underlying issue clears. + const MAX_BACKOFF_SHIFT: u32 = 5; + let client = self.clone(); tokio::spawn(async move { - let mut interval_timer = tokio::time::interval(interval); - interval_timer.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - log::info!( "Starting DLQ processor background task (interval_seconds: {})", interval.as_secs() ); + let mut consecutive_errors: u32 = 0; + loop { + // Run the scan first (immediate on first iteration, then after + // each subsequent wait — matches the prior tokio::interval + // behaviour where the first tick fires at t=0). + let mut next_wait = interval; + match client.process_failed_jobs().await { + Ok(count) => { + if count > 0 { + log::info!("DLQ processor moved failed jobs (moved_jobs: {})", count); + } + consecutive_errors = 0; + } + Err(e) => { + consecutive_errors = consecutive_errors.saturating_add(1); + let shift = consecutive_errors.min(MAX_BACKOFF_SHIFT); + let factor = 1u32 << shift; + next_wait = interval * factor; + log::error!( + "DLQ processor encountered error (consecutive: {}, next attempt in: {:?}): {}", + consecutive_errors, + next_wait, + e + ); + } + } + + // Wait or shut down. Cancellation always wins ties. tokio::select! { + biased; _ = cancellation_token.cancelled() => { log::info!("DLQ processor shutting down"); break; } - _ = interval_timer.tick() => { - match client.process_failed_jobs().await { - Ok(count) if count > 0 => { - log::info!("DLQ processor moved failed jobs (moved_jobs: {})", count); - } - Ok(_) => { - // No jobs moved, no need to log - } - Err(e) => { - log::error!("DLQ processor encountered error: {}", e); - } - } - } + _ = tokio::time::sleep(next_wait) => {} } } }) From 370be244e96025498e95f22420a2ee6c414b454f Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:34:41 -0700 Subject: [PATCH 14/19] fix(dlq): make failure_count a touch counter, not cumulative attempts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous UPSERT computed `failure_count = old + EXCLUDED.failure_count` where EXCLUDED.failure_count was the failing job's `attempts` value at the moment of capture. Net semantics were "cumulative handler-failure invocations summed across all retry cycles" — defensible but ambiguous, and easily confused with `attempts` on the original job, with `requeued_count` (admin requeue counter), or with the per-cycle attempt count. Switch to a clean touch-count semantics: `failure_count = N` means "this logical job (by job_key) has reached the DLQ N times, regardless of how many handler invocations each cycle involved." Initial INSERT sets it to 1; the ON CONFLICT branch increments by 1. 
For jobs without a job_key the UPSERT path doesn't apply, so each failure
creates a fresh row with failure_count=1.

Updated:

- The bind list in `add_to_dlq` (loses the `attempts` bind for that field;
  failure_count is now hard-coded to 1 in the SQL VALUES).
- The SELECT in `process_failed_jobs`'s atomic CTE (same change).
- `DlqJob::failure_count` rustdoc to explain the new contract clearly.
- Existing test `test_dlq_add_job_and_retrieve` (was asserting
  failure_count == job.attempts(), which was 0 for a freshly-enqueued job
  — coincidentally always-passing under the old semantics).

Adds `test_dlq_failure_count_is_touch_count`: verifies that two add_to_dlq
calls with the same job_key produce a single DLQ row with failure_count
progressing 1 → 2.
---
 src/client/dlq.rs  | 36 +++++++++++++++---------
 tests/dlq_tests.rs | 69 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 91 insertions(+), 14 deletions(-)

diff --git a/src/client/dlq.rs b/src/client/dlq.rs
index 0ea0fa2..5cd836b 100644
--- a/src/client/dlq.rs
+++ b/src/client/dlq.rs
@@ -32,7 +32,13 @@ pub struct DlqJob {
     pub max_attempts: Option<i32>,
     /// Human-readable failure reason
     pub failure_reason: String,
-    /// Number of times the job failed
+    /// How many times this logical job (by `job_key`) has reached the DLQ.
+    ///
+    /// For jobs with a `job_key`, the DLQ row is upserted: a job that fails,
+    /// gets requeued by an admin, and fails again touches the same DLQ row
+    /// twice — `failure_count` increments by 1 each time. For jobs without
+    /// a `job_key` every failure creates a fresh DLQ row, so this always
+    /// reads `1`.
     pub failure_count: i32,
     /// Last error details as JSON
     pub last_error: Option<serde_json::Value>,
@@ -554,26 +560,29 @@ impl BackfillClient {
             String::new()
         };

-        // Use UPSERT to handle the case where a requeued job fails again.
-        // If a DLQ entry with the same job_key already exists, update it
-        // instead of creating a duplicate. This ensures one DLQ entry per
-        // logical job and keeps failed_at current for cooldown calculations.
+        // UPSERT to handle the case where a requeued job fails again. If a
+        // DLQ entry with the same job_key already exists, update it instead
+        // of creating a duplicate — keeps `failed_at` current and counts the
+        // touch in `failure_count`.
+        //
+        // `failure_count` is a count of DLQ-touch events for this logical job
+        // (1 for first DLQ landing, +1 each subsequent requeue-then-fail).
+        // It is NOT a cumulative count of handler-failure invocations.
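+        // For example: fail → requeue → fail again yields failure_count = 2,
+        // even if each retry cycle burned several handler attempts.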
let upsert_query = format!( r#" - INSERT INTO {}.backfill_dlq ( + INSERT INTO {schema}.backfill_dlq ( original_job_id, task_identifier, payload, queue_name, priority, job_key, max_attempts, failure_reason, failure_count, last_error, original_created_at, original_run_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12) + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 1, $9, $10, $11) ON CONFLICT (job_key) WHERE job_key IS NOT NULL DO UPDATE SET failed_at = NOW(), - failure_count = {schema}.backfill_dlq.failure_count + EXCLUDED.failure_count, + failure_count = {schema}.backfill_dlq.failure_count + 1, failure_reason = EXCLUDED.failure_reason, last_error = EXCLUDED.last_error, original_job_id = EXCLUDED.original_job_id RETURNING * "#, - self.schema, schema = self.schema ); @@ -586,7 +595,6 @@ impl BackfillClient { .bind(original_job.key()) .bind(original_job.max_attempts()) .bind(failure_reason) - .bind(original_job.attempts()) .bind(&last_error) .bind(original_job.created_at()) .bind(original_job.run_at()) @@ -701,6 +709,9 @@ impl BackfillClient { // sources from `deleted`, so when DELETE finds nothing // (e.g., another worker already moved the row) the INSERT runs // zero times. + // failure_count semantics: count of DLQ-touch events for this job + // (1 on first move, +1 each subsequent requeue-fail). See + // DlqJob::failure_count docstring. let move_query = format!( r#" WITH deleted AS ( @@ -712,11 +723,11 @@ impl BackfillClient { original_created_at, original_run_at ) SELECT $1::bigint, $2::text, $3::jsonb, $4::text, $5::int, $6::text, - $7::int, $8::text, $9::int, $10::jsonb, $11::timestamptz, $12::timestamptz + $7::int, $8::text, 1, $9::jsonb, $10::timestamptz, $11::timestamptz FROM deleted ON CONFLICT (job_key) WHERE job_key IS NOT NULL DO UPDATE SET failed_at = NOW(), - failure_count = {schema}.backfill_dlq.failure_count + EXCLUDED.failure_count, + failure_count = {schema}.backfill_dlq.failure_count + 1, failure_reason = EXCLUDED.failure_reason, last_error = EXCLUDED.last_error, original_job_id = EXCLUDED.original_job_id @@ -735,7 +746,6 @@ impl BackfillClient { .bind(&job_key) .bind(max_attempts as i32) .bind(failure_reason) - .bind(attempts as i32) .bind(&last_error_json) .bind(created_at) .bind(run_at) diff --git a/tests/dlq_tests.rs b/tests/dlq_tests.rs index 2227d45..634c8ff 100644 --- a/tests/dlq_tests.rs +++ b/tests/dlq_tests.rs @@ -194,7 +194,9 @@ async fn test_dlq_add_job_and_retrieve() { assert_eq!(dlq_job.task_identifier, "test_job"); assert_eq!(dlq_job.failure_reason, "Test failure reason"); - assert_eq!(dlq_job.failure_count, *job.attempts() as i32); + // failure_count counts DLQ-touch events for this job_key, not handler + // attempts. First touch = 1 regardless of how many retries the job had. + assert_eq!(dlq_job.failure_count, 1); // Retrieve it let retrieved = client.get_dlq_job(dlq_job.id).await.expect("should retrieve"); @@ -704,6 +706,71 @@ async fn test_dlq_with_different_priorities() { assert!(priorities.contains(&5)); // BULK_LOW } +/// Regression test for P2-7: `failure_count` is a DLQ-touch counter, not a +/// cumulative handler-attempts counter. +/// +/// First DLQ touch for a given `job_key` should set failure_count = 1. +/// A subsequent UPSERT (same job_key, different `_private_jobs` id — +/// simulating a requeue-then-fail-again cycle) should increment to 2. 
+#[tokio::test] +async fn test_dlq_failure_count_is_touch_count() { + let client = setup_test_client("dlq_failure_count").await; + client.init_dlq().await.expect("DLQ init"); + + let job_data = TestJob { + message: "touch counter test".to_string(), + number: 7, + }; + + // First failure cycle: enqueue with a job_key, add to DLQ. + let outcome = client + .enqueue( + "touch_test", + &job_data, + JobSpec { + job_key: Some("touch_count_key".to_string()), + ..Default::default() + }, + ) + .await + .expect("first enqueue"); + let first = outcome.unwrap(); + let dlq_first = client + .add_to_dlq(&first, "first cycle failure", None) + .await + .expect("first add_to_dlq"); + assert_eq!(dlq_first.failure_count, 1, "first DLQ touch should be 1"); + + // Simulate a requeue-then-fail-again. Re-enqueueing with the same key + // (job_key_mode Replace) creates a fresh _private_jobs row; adding that + // to DLQ should UPSERT into the existing DLQ row by job_key. + let outcome2 = client + .enqueue( + "touch_test", + &job_data, + JobSpec { + job_key: Some("touch_count_key".to_string()), + ..Default::default() + }, + ) + .await + .expect("second enqueue"); + let second = outcome2.unwrap(); + let dlq_second = client + .add_to_dlq(&second, "second cycle failure", None) + .await + .expect("second add_to_dlq"); + + assert_eq!( + dlq_first.id, dlq_second.id, + "same job_key should map to the same DLQ row via UPSERT" + ); + assert_eq!( + dlq_second.failure_count, 2, + "second DLQ touch should increment to 2" + ); +} + /// Regression test for P1-1: `list_dlq_jobs.total` must reflect the filtered /// row count, not the unfiltered table size. /// From f30360d3e9afc9cc6642a3a851039654d635260b Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 19:42:18 -0700 Subject: [PATCH 15/19] docs: documentation cleanups (P2-5, P2-8, P2-11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P2-5 (Cargo.toml): graphile_worker version pin warning. The default ^0.11.1 constraint already excludes 0.12.x, but a patch release within 0.11.x can change the internal `_private_*` schema that backfill queries directly (cleanup, DLQ scanner, admin endpoints). Add a comment near the dep flagging this coupling so anyone running `cargo update` knows to re-run integration tests against any new graphile_worker version. Not tightening the version range (e.g., to `=0.11.4`) because that would prevent picking up upstream bug fixes; the test suite is the right backstop here, not a stricter pin. P2-8 (lib.rs EnqueueOutcome rustdoc): the `AlreadyInProgress` variant is a real footgun for callers who chain `.unwrap()` on `enqueue` results. Rewrite the enum's docstring to spell out exactly when it fires, that the new payload is *dropped* (not retried), and that `.unwrap()` / `.expect()` panic on this variant by design. Show the buggy auto-unwrap pattern explicitly so users recognize it in their own code. P2-11 (.sqlx + README + docs/01-database-setup.md): remove the empty `.sqlx/sqlx-data.json` stub and rewrite the SQLx documentation to be truthful. backfill itself uses runtime sqlx::query() — the compile-time sqlx::query!() macros don't fit because backfill targets dynamic schema names (the user's chosen schema). So: - Delete .sqlx/sqlx-data.json (was always `{}` — never populated, never served as an offline-mode cache). - Update README's "SQLx" section to clarify backfill's actual usage (runtime queries, no DATABASE_URL needed at compile time). 
- Add a callout at the top of docs/01-database-setup.md's SQLx section that the offline-mode setup it describes is for *user* queries against backfill's tables, not for backfill itself. --- .sqlx/sqlx-data.json | 1 - Cargo.toml | 5 +++++ README.md | 28 +++++++++++--------------- docs/01-database-setup.md | 16 ++++++++++++++- src/lib.rs | 42 ++++++++++++++++++++++++++++++++------- 5 files changed, 67 insertions(+), 25 deletions(-) delete mode 100644 .sqlx/sqlx-data.json diff --git a/.sqlx/sqlx-data.json b/.sqlx/sqlx-data.json deleted file mode 100644 index 0967ef4..0000000 --- a/.sqlx/sqlx-data.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/Cargo.toml b/Cargo.toml index 8e6542f..5ffbb03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,11 @@ axum = ["dep:axum"] [dependencies] chrono = { version = "0.4", features = ["serde"] } +# IMPORTANT: backfill queries graphile_worker's internal `_private_*` tables +# directly (cleanup, DLQ scanner, admin endpoints). The `_private_*` prefix +# signals these are internal API. Even within 0.11.x, a patch release that +# touches the internal schema can silently break backfill — re-run the +# integration tests against any new graphile_worker version before shipping. graphile_worker = "0.11.1" graphile_worker_crontab_parser = "0.5.12" log = "0.4.28" diff --git a/README.md b/README.md index 4a46108..72888c7 100644 --- a/README.md +++ b/README.md @@ -141,22 +141,18 @@ When workers crash without graceful shutdown, they can leave locks behind that p **⚠️ Warning:** Setting `stale_job_lock_timeout` too short can cause duplicate job execution if jobs legitimately run longer than the timeout. This can lead to data corruption. -### SQLx Compile-Time Query Verification - -This library uses SQLx's compile-time query verification for production safety. Set `DATABASE_URL` during compilation to enable type-safe, compile-time checked SQL queries: - -```bash -export DATABASE_URL="postgresql://localhost:5432/backfill" -cargo build # Queries verified against actual database schema -``` - -Alternatively, use offline mode with pre-generated query metadata: -```bash -cargo sqlx prepare # Generates .sqlx/sqlx-data.json -cargo build # Uses cached metadata, no database required -``` - -See [Database Setup](docs/01-database-setup.md#sqlx-compile-time-query-verification) for detailed setup instructions and best practices. +### SQLx usage + +Backfill currently uses runtime SQLx queries (`sqlx::query()` / +`sqlx::query_scalar()`) rather than the compile-time-checked +`sqlx::query!()` / `query_as!()` macros. No `DATABASE_URL` is required at +compile time, and there's no `.sqlx/` metadata cache to maintain. Schema +errors surface at runtime (caught by the integration test suite). + +If you write your own SQLx queries against backfill's tables in *your* +application, the compile-time macros are a great fit — see +[Database Setup](docs/01-database-setup.md) for `cargo sqlx prepare` +guidance. ### Automatic Setup diff --git a/docs/01-database-setup.md b/docs/01-database-setup.md index 79ad872..5287c2b 100644 --- a/docs/01-database-setup.md +++ b/docs/01-database-setup.md @@ -218,7 +218,21 @@ let client = BackfillClient::new_with_schema(&url, \"app1_jobs\").await?; let client = BackfillClient::new_with_schema(&url, \"app2_jobs\").await?; ``` -## SQLx Compile-Time Query Verification +## SQLx usage + +> **Note on backfill itself**: backfill uses runtime SQLx queries +> (`sqlx::query()` and friends) rather than the compile-time +> `sqlx::query!()` macros. 
This is a deliberate choice — backfill's SQL
+> targets dynamic schema names (the user's chosen schema, not a fixed
+> identifier), which the compile-time macros don't support cleanly. So
+> `DATABASE_URL` is **not** required at compile time, there is no `.sqlx/`
+> metadata to maintain, and schema mismatches surface at runtime (the
+> integration test suite is the safety net).
+>
+> The setup below is a recommendation for **your own** queries against
+> backfill's tables, in code you write that uses SQLx. If you don't write
+> any `sqlx::query!()` / `sqlx::query_as!()` macros yourself, you can
+> skip this section.
 
 This library uses SQLx for all database operations, which provides excellent
 compile-time verification of SQL queries. This is one of Rust's greatest
 strengths for database applications.
diff --git a/src/lib.rs b/src/lib.rs
index 7eaa18b..739f0da 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -281,17 +281,45 @@ impl Queue {
 /// Outcome of an enqueue operation.
 ///
-/// When enqueueing a job, the result can either be:
-/// - `Enqueued(Job)`: The job was successfully created or updated
-/// - `AlreadyInProgress { job_key }`: A job with this key is currently locked
-///   by a worker
+/// `enqueue` returns `Result<EnqueueOutcome, BackfillError>`. This enum
+/// distinguishes the two non-error outcomes:
+///
+/// - [`EnqueueOutcome::Enqueued`] — the job was created (or, with
+///   `JobKeyMode::Replace`, updated). The boxed `Job` carries the job's
+///   id/queue_id/etc.
+/// - [`EnqueueOutcome::AlreadyInProgress`] — a job with the same `job_key`
+///   was **currently locked by a worker** when we tried to add this one.
+///   This is *not* a duplicate-key collision (that's handled by
+///   `job_key_mode`); it's a race where the worker grabbed the existing
+///   job before our update could land. The new payload was discarded.
+///
+/// # ⚠️ Footgun warning
+///
+/// Treating every `Ok(_)` return as "the job is enqueued" is wrong. A
+/// caller who writes:
+///
+/// ```ignore
+/// // BUG: silently drops AlreadyInProgress as if it were success
+/// let job_id = client.enqueue(...).await?.unwrap().id();
+/// ```
+///
+/// will panic at runtime any time the race fires. Always pattern-match,
+/// or use [`EnqueueOutcome::is_already_in_progress`] / [`EnqueueOutcome::job`]
+/// to handle the case explicitly. `.unwrap()` and `.expect()` panic on
+/// `AlreadyInProgress` by design — they're only safe when you're certain
+/// no worker is holding the key.
 #[derive(Debug, Clone)]
 pub enum EnqueueOutcome {
-    /// Job was successfully enqueued (either created or updated)
+    /// Job was successfully enqueued (either created or updated).
     Enqueued(Box<Job>),
     /// A job with this key is already in progress (locked by a worker).
-    /// Contains the job_key that was in conflict.
-    AlreadyInProgress { job_key: String },
+    /// The new payload was **not** stored — your update is lost. Decide in
+    /// the caller whether to retry, queue a different job, or accept the
+    /// drop.
+    AlreadyInProgress {
+        /// The job_key that conflicted.
+        job_key: String,
+    },
 }
 
 impl EnqueueOutcome {

From 683bfb349a031d0d584fb64448e0f808a1573442 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:50:13 -0700
Subject: [PATCH 16/19] chore: P3 polish (cleanup SQL, DLQ-disabled log,
 re-export note)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

P3-1 (cleanup.rs): the stale-lock SQL was format-interpolating the
timeout in seconds directly into the query string.
Safe (it's a u64) but inconsistent with the parameterized norm
everywhere else. Replace `INTERVAL '{timeout_secs} seconds'` with a
bound parameter: `($1::bigint * interval '1 second')`. Both
`release_stale_queue_locks` and `release_stale_job_locks` updated.
Behaviour unchanged; existing tests cover this.

P3-2 (cleanup.rs cleanup_permanently_failed_jobs): when the DLQ is
disabled this function is the only mechanism that removes failed jobs
from `_private_jobs` — and those jobs are gone forever, since there's
no DLQ to capture them. Previously logged at INFO regardless. Now the
function checks `to_regclass('{schema}.backfill_dlq')` and:
- INFO if the DLQ table exists ("just GC, jobs were captured earlier")
- WARN with explicit "they cannot be recovered" wording if it doesn't,
  pointing the operator at `dlq_processor_interval` if they didn't mean
  to opt out of failure inspection.

P3-4 (lib.rs): add a doc note above the `pub use graphile_worker::{...}`
block flagging that backfill's public surface transitively includes
those types — graphile_worker version bumps that touch any of them
constitute breaking changes for backfill even when graphile_worker
itself doesn't tag them that way (graphile_worker is pre-1.0).
Includes a checklist for what to audit when upgrading.
---
 src/client/cleanup.rs | 54 +++++++++++++++++++++++++++++++++++--------
 src/lib.rs            | 20 ++++++++++++++++
 2 files changed, 64 insertions(+), 10 deletions(-)

diff --git a/src/client/cleanup.rs b/src/client/cleanup.rs
index ab314a8..f8ff160 100644
--- a/src/client/cleanup.rs
+++ b/src/client/cleanup.rs
@@ -57,18 +57,22 @@ impl BackfillClient {
     pub async fn release_stale_queue_locks(&self, timeout: Duration) -> Result<u64, BackfillError> {
         let timeout_secs = timeout.as_secs();
 
+        // Bind timeout via parameter rather than format-interpolating it.
+        // u64 → i64 cast is safe: i64::MAX seconds is ~292 billion years.
         let query = format!(
             r#"
             UPDATE {schema}._private_job_queues
             SET locked_at = NULL, locked_by = NULL
             WHERE locked_at IS NOT NULL
-              AND locked_at < NOW() - INTERVAL '{timeout_secs} seconds'
+              AND locked_at < NOW() - ($1::bigint * interval '1 second')
             "#,
             schema = self.schema,
-            timeout_secs = timeout_secs
         );
 
-        let result = sqlx::query(&query).execute(&self.pool).await?;
+        let result = sqlx::query(&query)
+            .bind(timeout_secs as i64)
+            .execute(&self.pool)
+            .await?;
         let released = result.rows_affected();
 
         // Always emit metrics (counter increments even for 0)
@@ -105,13 +109,15 @@
             UPDATE {schema}._private_jobs
             SET locked_at = NULL, locked_by = NULL
             WHERE locked_at IS NOT NULL
-              AND locked_at < NOW() - INTERVAL '{timeout_secs} seconds'
+              AND locked_at < NOW() - ($1::bigint * interval '1 second')
             "#,
             schema = self.schema,
-            timeout_secs = timeout_secs
         );
 
-        let result = sqlx::query(&query).execute(&self.pool).await?;
+        let result = sqlx::query(&query)
+            .bind(timeout_secs as i64)
+            .execute(&self.pool)
+            .await?;
         let released = result.rows_affected();
 
         // Always emit metrics (counter increments even for 0)
@@ -173,10 +179,38 @@ impl BackfillClient {
         crate::metrics::record_cleanup_failed_jobs_deleted(deleted);
 
         if deleted > 0 {
-            log::info!(
-                "Cleaned up permanently failed jobs from main queue (count: {})",
-                deleted
-            );
+            // When the DLQ is enabled, these rows have already been captured
+            // (either by an earlier `process_failed_jobs()` tick or by the
+            // synchronous pre-cleanup move in `WorkerRunner` startup) so this
+            // delete is just garbage collection — INFO is fine.
When the DLQ
+            // is *not* enabled, this delete is the only mechanism removing
+            // failed jobs from the main queue and they are gone forever:
+            // surface that loudly so an operator who didn't realize that's
+            // the consequence can see it in their logs.
+            //
+            // Detection uses `to_regclass` which returns NULL if the table
+            // doesn't exist. Failure of the existence check itself doesn't
+            // matter — we default to the louder log on uncertainty.
+            let dlq_oid: Option<String> = sqlx::query_scalar("SELECT to_regclass($1)::text")
+                .bind(format!("{}.backfill_dlq", self.schema))
+                .fetch_one(&self.pool)
+                .await
+                .ok()
+                .flatten();
+
+            if dlq_oid.is_some() {
+                log::info!(
+                    "Cleaned up permanently failed jobs from main queue (count: {})",
+                    deleted
+                );
+            } else {
+                log::warn!(
+                    "Deleted permanently failed jobs from main queue WITHOUT DLQ capture \
+                     (count: {}); they cannot be recovered. Enable dlq_processor_interval \
+                     to retain failed jobs for inspection.",
+                    deleted
+                );
+            }
         }
 
         Ok(deleted)
diff --git a/src/lib.rs b/src/lib.rs
index 739f0da..13def00 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -68,6 +68,26 @@
 //! ```
 
 use chrono::{DateTime, Utc};
+
+// === Re-exports from `graphile_worker` ===
+//
+// The types below are part of backfill's *public* API surface — users
+// importing `backfill::*` see them and write code against them. As a
+// consequence, every graphile_worker minor or patch release that touches
+// any of these types (renames, signature changes, removal) is a breaking
+// change for backfill, even when graphile_worker itself doesn't intend
+// it as one (graphile_worker is pre-1.0; minor bumps are allowed to be
+// breaking under semver, and patch releases occasionally are too).
+//
+// When upgrading the graphile_worker dependency:
+// 1. Re-run the integration test suite (`cargo nextest run -F axum`).
+// 2. Audit this re-export list against the new graphile_worker for any
+//    rename/removal — those need a corresponding backfill major bump and
+//    migration note for downstream users.
+// 3. Audit `_private_*` schema usage in src/client/dlq.rs,
+//    src/client/cleanup.rs, and src/admin.rs — graphile_worker reserves
+//    the right to change those tables across versions.
+
 // Lifecycle hooks for plugins - new Plugin API with event registration
 pub use graphile_worker::{
     // Event types (for hooks.on() registration)

From 79cfb4948b6c117b07660af7ed8e1ff3f175a020 Mon Sep 17 00:00:00 2001
From: C J Silverio
Date: Wed, 6 May 2026 19:55:02 -0700
Subject: [PATCH 17/19] test: add end-to-end retry-to-DLQ and
 concurrent-enqueue coverage

Closes two of the test-coverage gaps from section 6 of the review.

test_retry_to_exhaustion_then_dlq_via_worker (plugin_tests.rs):

Existing DLQ tests SQL-fake exhaustion via UPDATE _private_jobs SET
attempts = max_attempts, which skips the entire worker code path. This
new test runs the actual handler multiple times via the real worker --
each call increments attempts via get_job's attempts+1, the handler
returns Err, fail_job's SQL fires, run_at gets pushed into the future.
Between iterations the test fast-forwards run_at = NOW() so we don't
sleep through graphile_worker's exp(attempts) second backoff. After
max_attempts iterations the job naturally hits permanently-failed
state, then process_failed_jobs() moves it to DLQ via the atomic CTE
(P1-2).

Asserts: the job lands in DLQ with the right task_identifier and
job_key, failure_count is 1 (touch counter, P2-7), and _private_jobs
is empty afterward (P1-2's atomic move worked).
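For a sense of the delays the fast-forward skips (an illustrative
sketch of the exp(min(attempts, 10))-seconds schedule cited above, not
code from this change):

    // attempts=1 -> ~2.7s, 2 -> ~7.4s, 3 -> ~20s, 10+ -> ~6.1h (the cap)
    fn retry_delay_secs(attempts: u32) -> f64 {
        (attempts.min(10) as f64).exp()
    }
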
This exercises the seam between every layer added in the P0/P1 work
(P0-1 cleanup ordering, P0-3 plugin classification, P1-2 atomic move,
P2-7 touch counter) along with graphile_worker's own retry semantics.

test_concurrent_enqueue_under_load (integration_tests_clean.rs):

Spawns 10 tokio tasks each enqueueing 50 jobs in parallel against the
same pool. Verifies that all 500 enqueues succeed and that all 500
rows appear in _private_jobs. Catches the kind of regression that
would appear if a future refactor introduced row-level lock contention
or deadlock paths in the enqueue SQL.
---
 tests/integration_tests_clean.rs |  71 ++++++++++++++++++++
 tests/plugin_tests.rs            | 109 +++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+)

diff --git a/tests/integration_tests_clean.rs b/tests/integration_tests_clean.rs
index 563bb05..09c963d 100644
--- a/tests/integration_tests_clean.rs
+++ b/tests/integration_tests_clean.rs
@@ -1208,3 +1208,74 @@ async fn test_mixed_parallel_and_serial_jobs() -> Result<()> {
     })
     .await
 }
+
+/// Concurrent enqueue stress test (test gap from §6 of the review).
+///
+/// Spawns multiple tokio tasks each enqueueing a batch of jobs in parallel.
+/// Verifies that:
+/// - Every enqueue returns success (no deadlocks, no SQL errors).
+/// - All jobs land in `_private_jobs` (no lost rows).
+/// - Multiple workers/clients can hammer the enqueue path simultaneously
+///   without serializing on database locks.
+///
+/// This is the kind of regression test that would catch race conditions
+/// or contention bugs introduced by future changes to the enqueue path
+/// or to graphile_worker's add_job SQL.
+#[tokio::test]
+async fn test_concurrent_enqueue_under_load() -> Result<()> {
+    with_isolated_schema(|client| async move {
+        const TASKS: usize = 10;
+        const JOBS_PER_TASK: usize = 50;
+        const TOTAL: usize = TASKS * JOBS_PER_TASK;
+
+        // Fan out: each task enqueues JOBS_PER_TASK jobs serially within
+        // itself, but TASKS tasks run concurrently against the same pool.
+        let mut handles = Vec::with_capacity(TASKS);
+        for task_id in 0..TASKS {
+            let client = client.clone();
+            let handle = tokio::spawn(async move {
+                let mut enqueued = 0usize;
+                for job_id in 0..JOBS_PER_TASK {
+                    let payload = TestJob {
+                        message: format!("stress-{task_id}-{job_id}"),
+                        number: (task_id * JOBS_PER_TASK + job_id) as i32,
+                    };
+                    let outcome = client
+                        .enqueue("stress_job", &payload, JobSpec::default())
+                        .await?;
+                    if outcome.is_enqueued() {
+                        enqueued += 1;
+                    }
+                }
+                Ok::<usize, BackfillError>(enqueued)
+            });
+            handles.push(handle);
+        }
+
+        // Collect results — any task panic or task error fails the test.
+        let mut total_enqueued = 0usize;
+        for handle in handles {
+            let count = handle
+                .await
+                .map_err(|e| BackfillError::WorkerRuntime(format!("task join failed: {e}")))??;
+            total_enqueued += count;
+        }
+        assert_eq!(total_enqueued, TOTAL, "every concurrent enqueue must succeed");
+
+        // Verify every row landed in _private_jobs.
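+        // (Every enqueue above used JobSpec::default(), i.e. no job_key, so
+        // no replace/dedup semantics can collapse rows — the count is exact.)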
+        let row_count: (i64,) = sqlx::query_as(&format!(
+            "SELECT COUNT(*) FROM {}._private_jobs",
+            client.schema()
+        ))
+        .fetch_one(client.pool())
+        .await?;
+        assert_eq!(
+            row_count.0 as usize, TOTAL,
+            "all {} jobs must be persisted in _private_jobs",
+            TOTAL
+        );
+
+        Ok(())
+    })
+    .await
+}
diff --git a/tests/plugin_tests.rs b/tests/plugin_tests.rs
index 2b3c2ec..4df0356 100644
--- a/tests/plugin_tests.rs
+++ b/tests/plugin_tests.rs
@@ -447,6 +447,115 @@ async fn test_multiple_plugins_called_in_order() -> Result<()> {
     .await
 }
 
+/// End-to-end retry-to-exhaustion → DLQ test exercising the actual worker
+/// path (handler runs, graphile_worker reschedules, repeat until
+/// max_attempts hits, DLQ scanner moves the row).
+///
+/// Existing DLQ tests SQL-fake exhaustion via
+/// `UPDATE _private_jobs SET attempts = max_attempts`. This test instead
+/// runs the real worker + real handler + real `fail_job` SQL multiple
+/// times, fast-forwarding `run_at` between iterations to skip the
+/// `exp(attempts)` backoff sleeps that would otherwise slow the test to
+/// minutes. The retry-then-DLQ path through every layer (enqueue, get_job,
+/// handler, fail_job, run_at scheduling, eventual DLQ capture) is what
+/// gets validated.
+#[tokio::test]
+async fn test_retry_to_exhaustion_then_dlq_via_worker() -> Result<()> {
+    with_isolated_schema(|client| async move {
+        // FailJob with should_retry=true returns TemporaryUnavailable
+        // (retryable) on attempts < 3 and ValidationFailed (non-retryable)
+        // on the final attempt. PermanentFailurePlugin only acts on the
+        // JobFail event (will_retry=true), so the retryable failures just
+        // get rescheduled; the final failure fires JobPermanentlyFail
+        // because attempts has reached max_attempts naturally. Net effect:
+        // the job exhausts retries through graphile_worker's normal path.
+        client
+            .enqueue(
+                "fail_job",
+                &FailJob { should_retry: true },
+                JobSpec {
+                    max_attempts: Some(3),
+                    job_key: Some("retry_to_exhaustion_e2e".to_string()),
+                    ..Default::default()
+                },
+            )
+            .await?;
+
+        let config = WorkerConfig::new(get_test_database_url())
+            .with_schema(client.schema().to_string())
+            .with_poll_interval(Duration::from_millis(50));
+        let worker = WorkerRunner::builder(config)
+            .await?
+            .define_job::<FailJob>()
+            .build()
+            .await?;
+
+        // Run the worker repeatedly. After each iteration the failed job's
+        // `run_at` has been pushed forward by `exp(attempts)` seconds (1s,
+        // ~3s, ~7s, …). Force it back to NOW() so the next process_available_jobs()
+        // can pick it up immediately. Bounded loop so a regression that
+        // breaks the retry path doesn't hang the test forever.
+        const MAX_ITERATIONS: usize = 10;
+        let mut exhausted = false;
+        for _ in 0..MAX_ITERATIONS {
+            sqlx::query(&format!(
+                "UPDATE {}._private_jobs \
+                 SET run_at = NOW() \
+                 WHERE locked_at IS NULL AND attempts < max_attempts",
+                client.schema()
+            ))
+            .execute(client.pool())
+            .await?;
+
+            worker.process_available_jobs().await?;
+
+            // Check if attempts hit max_attempts.
+            let row: Option<(i16, i16)> = sqlx::query_as(&format!(
+                "SELECT attempts, max_attempts FROM {}._private_jobs LIMIT 1",
+                client.schema()
+            ))
+            .fetch_optional(client.pool())
+            .await?;
+            if let Some((attempts, max_attempts)) = row
+                && attempts >= max_attempts
+                && max_attempts > 0
+            {
+                exhausted = true;
+                break;
+            }
+        }
+        assert!(
+            exhausted,
+            "Worker should have exhausted retries within {} iterations",
+            MAX_ITERATIONS
+        );
+
+        // Now run the DLQ processor. The job should be captured.
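+        // process_failed_jobs() is the same entry point the background DLQ
+        // processor invokes on each tick, so calling it inline here stands
+        // in for waiting out a processor interval.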
+ let moved = client.process_failed_jobs().await?; + assert_eq!(moved, 1, "exhausted job should move to DLQ on next process_failed_jobs"); + + let dlq = client.list_dlq_jobs(DlqFilter::default()).await?; + assert_eq!(dlq.jobs.len(), 1); + assert_eq!(dlq.jobs[0].task_identifier, "fail_job"); + assert_eq!(dlq.jobs[0].job_key.as_deref(), Some("retry_to_exhaustion_e2e")); + // `failure_count` is the touch counter (P2-7) — one DLQ touch. + assert_eq!(dlq.jobs[0].failure_count, 1); + + // The original job row should be gone from _private_jobs (the DLQ + // move's atomic CTE deleted it — P1-2). + let count: (i64,) = sqlx::query_as(&format!( + "SELECT COUNT(*) FROM {}._private_jobs", + client.schema() + )) + .fetch_one(client.pool()) + .await?; + assert_eq!(count.0, 0, "_private_jobs should be empty after DLQ move"); + + Ok(()) + }) + .await +} + /// Regression test for P0-3: non-retryable `WorkerError` variants must /// short-circuit retries instead of running all the way up to `max_attempts`. /// From 4b7daba5ede8a7e4a49ebece73f076b94feae78c Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 20:00:03 -0700 Subject: [PATCH 18/19] chore: bump version to 1.2.0 --- CHANGELOG.md | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.lock | 2 +- Cargo.toml | 2 +- 3 files changed, 122 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7039085..9267468 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,126 @@ All notable changes to the Backfill project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project will adhere to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) after reaching version 1.0.0. +## [1.2.0] — Production-readiness audit + +A focused bug-fix release driven by a top-to-bottom production-readiness +review of the priority queue, retry, and DLQ subsystems. Several silent +failure modes are corrected, several misleading APIs are deprecated with +migration paths, and two long-tail correctness bugs in the DLQ subsystem +are fixed. Every fix is covered by a regression test that fails on the +prior code. + +### Critical fixes + +- **DLQ no longer loses jobs on worker restart.** `WorkerRunner` startup + used to call `cleanup_permanently_failed_jobs()` (which `DELETE`s rows + with `attempts >= max_attempts`) before the DLQ processor's first tick + — any job that hit max_attempts while the worker was down was silently + deleted instead of captured. The startup sequence now runs + `process_failed_jobs()` synchronously before cleanup whenever the DLQ + is enabled. +- **`WorkerError::is_retryable()` is now wired to actual retry behaviour.** + The classification was previously dead code; a `WorkerError::ValidationFailed` + would retry up to `max_attempts` instead of going straight to DLQ. + A new `PermanentFailurePlugin` (auto-registered when DLQ is enabled) + hooks `JobFail`, classifies the error, and short-circuits retries for + non-retryable variants by setting `attempts = max_attempts`. +- **`process_failed_jobs` now atomically moves rows from `_private_jobs` to + the DLQ.** Previously the UPSERT and DELETE were separate statements; a + crash between them could duplicate the job. Combined into a single + writable-CTE statement. + +### Breaking changes + +- `WorkerRunner::process_available_jobs` returns `Result<(), BackfillError>` + (was `Result` always returning `Ok(0)`). 
For + job-count instrumentation, register a `JobComplete` / `JobFail` plugin + before building the worker. +- `WorkerRunner::worker_count()` returns `1` truthfully (was + `queue_configs.len()`, which lied any time multi-queue config was passed + — only one worker has ever actually been spawned). + +### Deprecations + +All marked `#[deprecated(since = "1.2.0")]` with migration notes. +Scheduled for removal in `2.0.0`. + +- `RetryPolicy::new(4-arg)`, `with_jitter`, `calculate_delay`, + `calculate_retry_time`, `JobSpec::calculate_retry_time` — graphile_worker + uses a hard-coded `exp(min(attempts, 10))`-second SQL formula for retry + scheduling, so the timing fields on `RetryPolicy` and the math helpers + that operated on them never reached the worker. Only `max_attempts` was + ever honored. Use `RetryPolicy { max_attempts: n, ..Default::default() }` + or the `fast` / `aggressive` / `conservative` presets, which now + differ only in attempt count. +- `WorkerConfig::with_queues`, `QueueConfig::named_queue`, + `QueueConfig::priority_queue` — graphile_worker doesn't expose + per-worker queue filtering; only the first config's `concurrency` + was ever used at runtime, and `priority_range` was never read. Use + `WorkerConfig::with_concurrency(n)` and route jobs to named queues at + enqueue time via `Queue::serial(name)`. + +### Other fixes and improvements + +- `list_dlq_jobs.total` now respects filters (was returning unfiltered + count regardless of filter, breaking paginated admin UIs). +- `Queue::metric_label()` added — bounded `"parallel"`/`"serial"` label + for built-in metrics. Built-in metric emission no longer uses + unbounded queue names (e.g., `Queue::serial_for("user", id)`) as + Prometheus labels. +- DLQ-side metrics emit `"parallel"` instead of empty string for + parallel-origin jobs. +- `requeue_dlq_job` no longer fails the operation when the post-enqueue + bookkeeping `UPDATE` blips; the enqueue (the real intent) succeeded, + so a stale `requeued_count` is logged at `WARN` rather than propagated + as an error. +- DLQ processor uses exponential backoff on consecutive errors + (interval, 2x, 4x, … capped at 32x). Resets on success. +- `process_failed_jobs` switched from `id NOT IN (subquery)` to + `NOT EXISTS` for the DLQ-membership check — better planner behaviour + and no NULL-poisoning workaround needed. +- `failure_count` on DLQ rows is now a clean touch counter (1 on first + DLQ landing, +1 each subsequent UPSERT) instead of cumulative + handler-attempts. +- `cleanup_permanently_failed_jobs` now logs at `WARN` when DLQ is + disabled (those jobs cannot be recovered) and `INFO` when DLQ exists. +- `delete_dlq_job` halved its DB round-trips (single `DELETE … RETURNING`). +- `enqueue` records duration on every outcome, not just success. +- Stale-lock SQL parameterized instead of format-interpolating timeout. +- `enqueue_emergency` no longer redundantly sets `run_at = NOW()`. + +### Documentation + +- Truthful `RetryPolicy` rustdoc — explicit about which fields graphile_worker + honors and which are stored-but-ignored. +- `EnqueueOutcome::AlreadyInProgress` rustdoc rewritten to flag the + footgun of unconditionally `.unwrap()`ing. +- `docs/01-database-setup.md` clarifies that backfill itself uses runtime + SQLx queries; the offline-mode setup is for *user* queries. +- `docs/02-dlq.md` "queue_name shows as default" warning marked resolved + (was fixed in 1.1.1 / PR #9, docs were stale). 
+- `Cargo.toml` warns about the `_private_*` schema coupling so + `cargo update` doesn't silently break across graphile_worker patch + releases. +- `lib.rs` documents which `graphile_worker` types are re-exported and + the SemVer implications of upstream changes. + +### Internals + +- Removed orphan `src/client/queries.rs` (was never `mod`-declared). +- Removed empty `.sqlx/sqlx-data.json` (compile-time query macros aren't + used). + +### Tests + +- Test count: 107 → 115 (+8). New regression tests for the startup race, + the retryable / non-retryable plugin behaviour, the filtered-pagination + fix, the failure_count touch counter, the bounded metric label, an + end-to-end retry-to-DLQ path through the actual worker, and a + concurrent-enqueue stress test. +- All P0 and P1 fixes were verified to *fail* on the prior code via + `git stash` / re-run before being committed. + ## [Unreleased] ### Breaking Changes diff --git a/Cargo.lock b/Cargo.lock index 3d05d5f..ea0a9b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -179,7 +179,7 @@ dependencies = [ [[package]] name = "backfill" -version = "1.1.1" +version = "1.2.0" dependencies = [ "axum", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 5ffbb03..03b71b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "backfill" -version = "1.1.1" +version = "1.2.0" edition = "2024" description = "A boringly-named priority work queue system for doing async tasks." categories = ["concurrency", "data-structures"] From f91087d8672d36129a77465d63079cf6c3ed3297 Mon Sep 17 00:00:00 2001 From: C J Silverio Date: Wed, 6 May 2026 20:10:01 -0700 Subject: [PATCH 19/19] chore: cargo +nightly fmt CI runs cargo +nightly fmt --check; my edits were formatted under stable fmt which has slightly different defaults around method-chain wrapping. Apply the nightly formatter so CI's fmt step passes. No behavioural change. --- examples/basic_worker.rs | 3 +-- src/client/cleanup.rs | 6 +++--- src/lib.rs | 35 +++++++++++++++----------------- src/worker.rs | 13 +++++------- tests/dlq_tests.rs | 31 +++++++++------------------- tests/integration_tests_clean.rs | 13 ++++-------- tests/plugin_tests.rs | 14 ++++++------- 7 files changed, 46 insertions(+), 69 deletions(-) diff --git a/examples/basic_worker.rs b/examples/basic_worker.rs index 846563d..c6ed622 100644 --- a/examples/basic_worker.rs +++ b/examples/basic_worker.rs @@ -16,8 +16,7 @@ use std::num::ParseIntError; use std::time::Duration; use backfill::{ - BackfillError, IntoTaskHandlerResult, TaskHandler, WorkerConfig, WorkerContext, WorkerError, - WorkerRunner, + BackfillError, IntoTaskHandlerResult, TaskHandler, WorkerConfig, WorkerContext, WorkerError, WorkerRunner, }; use log::{error, info, warn}; use serde::{Deserialize, Serialize}; diff --git a/src/client/cleanup.rs b/src/client/cleanup.rs index f8ff160..22dd5d0 100644 --- a/src/client/cleanup.rs +++ b/src/client/cleanup.rs @@ -149,9 +149,9 @@ impl BackfillClient { /// /// Safe usage when DLQ is enabled: /// 1. Call `process_failed_jobs()` first (moves rows into DLQ). - /// 2. Then call this function (cleans up anything the DLQ processor - /// chose not to move — typically jobs with `max_attempts = 0`, which - /// the DLQ processor explicitly skips). + /// 2. Then call this function (cleans up anything the DLQ processor chose + /// not to move — typically jobs with `max_attempts = 0`, which the DLQ + /// processor explicitly skips). /// /// `WorkerRunner::run_until_cancelled` already enforces this ordering at /// startup. 
Direct callers (ad-hoc maintenance scripts, etc.) must enforce diff --git a/src/lib.rs b/src/lib.rs index 13def00..1dd3679 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,9 +14,9 @@ //! [`RetryPolicy`]) //! - **Configurable max-attempts per job** with `fast`, `aggressive`, and //! `conservative` presets -//! - **Permanent-failure short-circuit** — non-retryable `WorkerError` -//! variants land in the DLQ on the first failure instead of waiting for -//! `max_attempts` to exhaust +//! - **Permanent-failure short-circuit** — non-retryable `WorkerError` variants +//! land in the DLQ on the first failure instead of waiting for `max_attempts` +//! to exhaust //! - **Dead letter queue** handling for failed jobs //! - **Type-safe job handlers** using Rust's type system //! - **Low-latency execution** via PostgreSQL LISTEN/NOTIFY @@ -68,7 +68,6 @@ //! ``` use chrono::{DateTime, Utc}; - // === Re-exports from `graphile_worker` === // // The types below are part of backfill's *public* API surface — users @@ -81,12 +80,10 @@ use chrono::{DateTime, Utc}; // // When upgrading the graphile_worker dependency: // 1. Re-run the integration test suite (`cargo nextest run -F axum`). -// 2. Audit this re-export list against the new graphile_worker for any -// rename/removal — those need a corresponding backfill major bump and -// migration note for downstream users. -// 3. Audit `_private_*` schema usage in src/client/dlq.rs, -// src/client/cleanup.rs, and src/admin.rs — graphile_worker reserves -// the right to change those tables across versions. +// 2. Audit this re-export list against the new graphile_worker for any rename/removal — those need a corresponding +// backfill major bump and migration note for downstream users. +// 3. Audit `_private_*` schema usage in src/client/dlq.rs, src/client/cleanup.rs, and src/admin.rs — +// graphile_worker reserves the right to change those tables across versions. // // Lifecycle hooks for plugins - new Plugin API with event registration pub use graphile_worker::{ @@ -307,11 +304,11 @@ impl Queue { /// - [`EnqueueOutcome::Enqueued`] — the job was created (or, with /// `JobKeyMode::Replace`, updated). The boxed `Job` carries the job's /// id/queue_id/etc. -/// - [`EnqueueOutcome::AlreadyInProgress`] — a job with the same `job_key` -/// was **currently locked by a worker** when we tried to add this one. -/// This is *not* a duplicate-key collision (that's handled by -/// `job_key_mode`); it's a race where the worker grabbed the existing -/// job before our update could land. The new payload was discarded. +/// - [`EnqueueOutcome::AlreadyInProgress`] — a job with the same `job_key` was +/// **currently locked by a worker** when we tried to add this one. This is +/// *not* a duplicate-key collision (that's handled by `job_key_mode`); it's a +/// race where the worker grabbed the existing job before our update could +/// land. The new payload was discarded. /// /// # ⚠️ Footgun warning /// @@ -628,10 +625,10 @@ where /// Enqueue a critical job with a high retry count (12 attempts). /// -/// Use for jobs that must eventually succeed if at all possible. graphile_worker -/// retries on a fixed `exp(min(attempts, 10))` second schedule, capping at -/// ~6h per retry — so 12 attempts gives roughly half a day of total retry -/// coverage. See [`RetryPolicy`] for the full timing. +/// Use for jobs that must eventually succeed if at all possible. 
+/// graphile_worker retries on a fixed `exp(min(attempts, 10))` second schedule, +/// capping at ~6h per retry — so 12 attempts gives roughly half a day of total +/// retry coverage. See [`RetryPolicy`] for the full timing. pub async fn enqueue_critical( client: &BackfillClient, task_identifier: &str, diff --git a/src/worker.rs b/src/worker.rs index 881b00a..91b5b68 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -573,12 +573,12 @@ impl WorkerRunner { // If DLQ is enabled, register the built-in plugins that wire backfill's // error semantics through to graphile_worker: // - // - DlqCleanupPlugin: when a requeued DLQ job completes successfully, - // removes the matching DLQ entry so it isn't requeued again. + // - DlqCleanupPlugin: when a requeued DLQ job completes successfully, removes + // the matching DLQ entry so it isn't requeued again. // - PermanentFailurePlugin: when a handler returns a non-retryable // `WorkerError` variant (ValidationFailed, Unauthorized, etc.), - // short-circuits remaining retry attempts so the job lands in the - // DLQ on the next processor tick instead of waiting hours. + // short-circuits remaining retry attempts so the job lands in the DLQ on the + // next processor tick instead of waiting hours. if self.config.dlq_processor_interval.is_some() { let cleanup_plugin = crate::DlqCleanupPlugin::new(self.client.clone()); worker_options = worker_options.add_plugin(cleanup_plugin); @@ -756,10 +756,7 @@ impl WorkerRunner { Err(e) => { // If this fails, the next call to startup_cleanup may delete the // affected jobs. We log loudly so an operator can investigate. - log::error!( - "Pre-cleanup DLQ move failed; jobs may be lost when cleanup runs: {}", - e - ); + log::error!("Pre-cleanup DLQ move failed; jobs may be lost when cleanup runs: {}", e); } } } diff --git a/tests/dlq_tests.rs b/tests/dlq_tests.rs index 634c8ff..8c026f2 100644 --- a/tests/dlq_tests.rs +++ b/tests/dlq_tests.rs @@ -765,10 +765,7 @@ async fn test_dlq_failure_count_is_touch_count() { dlq_first.id, dlq_second.id, "same job_key should map to the same DLQ row via UPSERT" ); - assert_eq!( - dlq_second.failure_count, 2, - "second DLQ touch should increment to 2" - ); + assert_eq!(dlq_second.failure_count, 2, "second DLQ touch should increment to 2"); } /// Regression test for P1-1: `list_dlq_jobs.total` must reflect the filtered @@ -811,10 +808,7 @@ async fn test_list_dlq_jobs_total_respects_filters() { } // Unfiltered: total = 8. - let all = client - .list_dlq_jobs(DlqFilter::default()) - .await - .expect("list all"); + let all = client.list_dlq_jobs(DlqFilter::default()).await.expect("list all"); assert_eq!(all.total, 8); // Filter by task_a — must report 5, not 8. @@ -859,11 +853,12 @@ async fn test_list_dlq_jobs_total_respects_filters() { /// Regression test for the DLQ-vs-cleanup startup race (P0-1). /// /// Before the fix, `WorkerRunner::run_until_cancelled` called -/// `startup_cleanup_with_timeouts` first, which DELETEs rows from `_private_jobs` -/// where `attempts >= max_attempts`. The DLQ processor was supposed to capture -/// those same rows, but it ran as a periodic background task that hadn't ticked -/// yet at startup. Net effect: jobs that hit max_attempts while the worker was -/// down (or in the gap before the next DLQ tick) were silently deleted. +/// `startup_cleanup_with_timeouts` first, which DELETEs rows from +/// `_private_jobs` where `attempts >= max_attempts`. 
The DLQ processor was +/// supposed to capture those same rows, but it ran as a periodic background +/// task that hadn't ticked yet at startup. Net effect: jobs that hit +/// max_attempts while the worker was down (or in the gap before the next DLQ +/// tick) were silently deleted. /// /// The fix runs `process_failed_jobs()` synchronously before cleanup when DLQ /// is enabled. This test stages a permanently-failed job, runs the actual @@ -917,10 +912,7 @@ async fn test_worker_startup_moves_failed_jobs_to_dlq_before_cleanup() { .expect("stage permanent failure"); // Sanity check: not in DLQ yet, still in main table. - let dlq_pre = client - .list_dlq_jobs(DlqFilter::default()) - .await - .expect("list DLQ pre"); + let dlq_pre = client.list_dlq_jobs(DlqFilter::default()).await.expect("list DLQ pre"); assert_eq!(dlq_pre.jobs.len(), 0, "DLQ should be empty before worker startup"); // Build and run a real WorkerRunner. The DLQ processor's tick interval is @@ -947,10 +939,7 @@ async fn test_worker_startup_moves_failed_jobs_to_dlq_before_cleanup() { let _ = handle.await; // Verify: the job is in DLQ, not deleted. - let dlq_post = client - .list_dlq_jobs(DlqFilter::default()) - .await - .expect("list DLQ post"); + let dlq_post = client.list_dlq_jobs(DlqFilter::default()).await.expect("list DLQ post"); assert_eq!( dlq_post.jobs.len(), 1, diff --git a/tests/integration_tests_clean.rs b/tests/integration_tests_clean.rs index 09c963d..fe6d531 100644 --- a/tests/integration_tests_clean.rs +++ b/tests/integration_tests_clean.rs @@ -1240,9 +1240,7 @@ async fn test_concurrent_enqueue_under_load() -> Result<()> { message: format!("stress-{task_id}-{job_id}"), number: (task_id * JOBS_PER_TASK + job_id) as i32, }; - let outcome = client - .enqueue("stress_job", &payload, JobSpec::default()) - .await?; + let outcome = client.enqueue("stress_job", &payload, JobSpec::default()).await?; if outcome.is_enqueued() { enqueued += 1; } @@ -1263,12 +1261,9 @@ async fn test_concurrent_enqueue_under_load() -> Result<()> { assert_eq!(total_enqueued, TOTAL, "every concurrent enqueue must succeed"); // Verify every row landed in _private_jobs. - let row_count: (i64,) = sqlx::query_as(&format!( - "SELECT COUNT(*) FROM {}._private_jobs", - client.schema() - )) - .fetch_one(client.pool()) - .await?; + let row_count: (i64,) = sqlx::query_as(&format!("SELECT COUNT(*) FROM {}._private_jobs", client.schema())) + .fetch_one(client.pool()) + .await?; assert_eq!( row_count.0 as usize, TOTAL, "all {} jobs must be persisted in _private_jobs", diff --git a/tests/plugin_tests.rs b/tests/plugin_tests.rs index 4df0356..3c886fc 100644 --- a/tests/plugin_tests.rs +++ b/tests/plugin_tests.rs @@ -543,12 +543,9 @@ async fn test_retry_to_exhaustion_then_dlq_via_worker() -> Result<()> { // The original job row should be gone from _private_jobs (the DLQ // move's atomic CTE deleted it — P1-2). - let count: (i64,) = sqlx::query_as(&format!( - "SELECT COUNT(*) FROM {}._private_jobs", - client.schema() - )) - .fetch_one(client.pool()) - .await?; + let count: (i64,) = sqlx::query_as(&format!("SELECT COUNT(*) FROM {}._private_jobs", client.schema())) + .fetch_one(client.pool()) + .await?; assert_eq!(count.0, 0, "_private_jobs should be empty after DLQ move"); Ok(()) @@ -626,7 +623,10 @@ async fn test_non_retryable_error_short_circuits_retries() -> Result<()> { // can call it directly to verify the end-to-end behaviour: the job // ends up in the DLQ within a single retry-cycle's worth of time. 
let moved = client.process_failed_jobs().await?; - assert_eq!(moved, 1, "permanent-failure short-circuit should make the job DLQ-eligible"); + assert_eq!( + moved, 1, + "permanent-failure short-circuit should make the job DLQ-eligible" + ); let dlq = client.list_dlq_jobs(DlqFilter::default()).await?; assert_eq!(dlq.jobs.len(), 1);