Skip to content

Commit 3579efe

Browse files
authored
Server: backport expired message cleaner updates (#1156)
Widens the period of the cleaner to be 12h when row counts cleaned dips below the batch size. The legacy cleaner query also has its timeout extended as was done when troubleshooting the cleaner for one deployment. We may need to do the same for the new `messagecontent` table, but for now I left it as-is. Some additional refactors were needed with the 12h period to ensure the process would not block (for hours) when the shutdown atomic gets set. Additionally, since the "worst case" search for the `message` table cleaner is the inevitable conclusion when a deployment starts writing to `messagecontent` instead, a check is done when the cleaner first starts up to see if there are any pending payloads at all in the `message` table. This allows us to skip repeatedly scanning a table that will never have new message payloads added to it.
2 parents fba87d5 + 7e8635b commit 3579efe

File tree

2 files changed

+107
-20
lines changed

2 files changed

+107
-20
lines changed

server/svix-server/src/expired_message_cleaner.rs

Lines changed: 106 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,54 @@
33

44
use std::sync::atomic::Ordering;
55

6-
use crate::error::Result;
7-
use sea_orm::{ConnectionTrait, DatabaseConnection, DbErr, Statement, UpdateResult};
8-
use std::time::Duration;
9-
use tokio::time::sleep;
6+
use crate::error::{Error, Result};
7+
use sea_orm::{
8+
ConnectionTrait, DatabaseConnection, DbErr, ExecResult, QueryResult, Statement,
9+
TransactionTrait, UpdateResult,
10+
};
11+
use std::time::{Duration, Instant};
12+
13+
type DbResult<T> = std::result::Result<T, DbErr>;
14+
15+
async fn exec_without_timeout(pool: &DatabaseConnection, stmt: Statement) -> DbResult<ExecResult> {
16+
let increase_timeout = Statement::from_string(
17+
pool.get_database_backend(),
18+
"SET LOCAL statement_timeout=0;",
19+
);
20+
let tx = pool.begin().await?;
21+
let _ = tx.execute(increase_timeout).await?;
22+
let res = tx.execute(stmt).await?;
23+
tx.commit().await?;
24+
Ok(res)
25+
}
26+
async fn query_one_without_timeout(
27+
pool: &DatabaseConnection,
28+
stmt: Statement,
29+
) -> DbResult<Option<QueryResult>> {
30+
let increase_timeout = Statement::from_string(
31+
pool.get_database_backend(),
32+
"SET LOCAL statement_timeout=0;",
33+
);
34+
let tx = pool.begin().await?;
35+
let _ = tx.execute(increase_timeout).await?;
36+
let res = tx.query_one(stmt).await?;
37+
tx.commit().await?;
38+
Ok(res)
39+
}
1040

1141
/// Nullifies the payload column for expired messages,
1242
/// `limit` sets how many rows to update at a time.
1343
pub async fn clean_expired_messages(
1444
pool: &DatabaseConnection,
1545
limit: u32,
16-
) -> std::result::Result<UpdateResult, DbErr> {
17-
let legacy_stmt = Statement::from_sql_and_values(
18-
pool.get_database_backend(),
19-
r#"
46+
enable_legacy_message_cleaner: bool,
47+
) -> DbResult<UpdateResult> {
48+
// See the docs for [`has_message_payloads_pending_expiry`] for background on the legacy cleaner.
49+
let legacy_row_count = if enable_legacy_message_cleaner {
50+
let legacy_res = {
51+
let legacy_stmt = Statement::from_sql_and_values(
52+
pool.get_database_backend(),
53+
r#"
2054
UPDATE message SET payload = NULL WHERE id IN (
2155
SELECT id FROM message
2256
WHERE
@@ -26,9 +60,15 @@ pub async fn clean_expired_messages(
2660
FOR UPDATE SKIP LOCKED
2761
)
2862
"#,
29-
[limit.into()],
30-
);
31-
let legacy_res = pool.execute(legacy_stmt).await?;
63+
[limit.into()],
64+
);
65+
66+
exec_without_timeout(pool, legacy_stmt).await?
67+
};
68+
legacy_res.rows_affected()
69+
} else {
70+
0
71+
};
3272

3373
let stmt = Statement::from_sql_and_values(
3474
pool.get_database_backend(),
@@ -48,32 +88,79 @@ pub async fn clean_expired_messages(
4888
let res = pool.execute(stmt).await?;
4989

5090
Ok(UpdateResult {
51-
rows_affected: legacy_res.rows_affected() + res.rows_affected(),
91+
rows_affected: legacy_row_count + res.rows_affected(),
5292
})
5393
}
5494

95+
/// Checks to see if the message table has any non-null payloads requiring expiry.
96+
///
97+
/// ## Background
98+
///
99+
/// Initially payloads were modeled as a field in `message`, but later migrated to a separate
100+
/// table (`messagecontent`). In cases where there are no longer any payloads to expire in `message` we
101+
/// can avoid the expense of running the cleaner on the `message` table since all new messages should now be using
102+
/// `messagecontent`.
103+
async fn has_message_payloads_pending_expiry(pool: &DatabaseConnection) -> Result<bool> {
104+
query_one_without_timeout(
105+
pool,
106+
Statement::from_string(
107+
pool.get_database_backend(),
108+
r#"SELECT EXISTS (SELECT 1 FROM message WHERE payload IS NOT NULL LIMIT 1)"#,
109+
),
110+
)
111+
.await?
112+
.ok_or_else(|| Error::generic("failed to check for message payloads"))?
113+
.try_get_by_index(0)
114+
.map_err(|e| Error::generic(format!("failed to check for message payloads: {e}")))
115+
}
116+
55117
/// Polls the database for expired messages to nullify payloads for.
56118
///
57119
/// Uses a variable polling schedule, based on affected row counts each iteration of the loop.
58120
pub async fn expired_message_cleaner_loop(pool: &DatabaseConnection) -> Result<()> {
121+
let message_table_needs_cleaning = has_message_payloads_pending_expiry(pool).await?;
122+
if !message_table_needs_cleaning {
123+
tracing::info!("No payloads pending expiry found in `message` table. Skipping the cleaner for this table.");
124+
}
125+
59126
// When no rows have been updated, widen the interval.
60-
const IDLE: Duration = Duration::from_secs(10);
127+
const IDLE: Duration = Duration::from_secs(60 * 60 * 12);
61128
// When the affected row count dips below this, switch to the `SLOWING` interval.
62-
const SLOWING_THRESHOLD: u64 = 1_000;
63-
const SLOWING: Duration = Duration::from_secs(3);
129+
const SLOWING_THRESHOLD: u64 = 5_000;
130+
const SLOWING: Duration = Duration::from_secs(60 * 60 * 12);
131+
const ON_ERROR: Duration = Duration::from_secs(10);
64132
const BATCH_SIZE: u32 = 5_000;
65-
let mut sleep_time = Some(IDLE);
133+
let mut sleep_time = None;
66134
loop {
67135
if let Some(duration) = sleep_time {
68-
sleep(duration).await;
136+
let sleep_start = Instant::now();
137+
let mut interval = tokio::time::interval(Duration::from_secs(10));
138+
interval.tick().await;
139+
// Doing a plain sleep() was fine when the polling frequency was mere seconds, but since we're doing wider
140+
// periods now (hours, not seconds), we need to be a little more careful about not preventing the process
141+
// from shutting down.
142+
// Using `interval()` so we can track how long we've been sleeping for, while still checking for the
143+
// shutdown signal.
144+
'inner: loop {
145+
if crate::SHUTTING_DOWN.load(Ordering::SeqCst) {
146+
return Ok(());
147+
}
148+
interval.tick().await;
149+
if sleep_start.elapsed() > duration {
150+
break 'inner;
151+
}
152+
}
69153
}
70-
match clean_expired_messages(pool, BATCH_SIZE).await {
154+
155+
let start = Instant::now();
156+
match clean_expired_messages(pool, BATCH_SIZE, message_table_needs_cleaning).await {
71157
Err(err) => {
72158
tracing::error!("{}", err);
159+
sleep_time = Some(ON_ERROR);
73160
}
74161
Ok(UpdateResult { rows_affected }) => {
75162
if rows_affected > 0 {
76-
tracing::trace!("expired {} payloads", rows_affected);
163+
tracing::debug!(elapsed =? start.elapsed(), "expired {} payloads", rows_affected);
77164
}
78165

79166
sleep_time = match rows_affected {

server/svix-server/tests/e2e_message.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,7 @@ async fn test_payload_retention_period() {
411411
.unwrap();
412412
assert_eq!(content.unwrap().id, msg_id.clone());
413413

414-
expired_message_cleaner::clean_expired_messages(&pool, 5000)
414+
expired_message_cleaner::clean_expired_messages(&pool, 5000, false)
415415
.await
416416
.unwrap();
417417

0 commit comments

Comments
 (0)