Server: backport expired message cleaner updates (#1156)

svix-onelson · web-flow · commit 3579efe7f4c7 · 2024-01-09T16:12:01.000-08:00
Widens the cleaner's polling period to 12h when the number of rows
cleaned dips below the batch size.

The legacy cleaner query also has its statement timeout lifted (via
`SET LOCAL statement_timeout=0`), as was done when troubleshooting the
cleaner for one deployment. We may need to do the same for the new
`messagecontent` table, but for now I left it as-is.

Some additional refactors were needed with the 12h period to ensure the
process would not block (for hours) when the shutdown atomic gets set.

Additionally, since the "worst case" search for the `message` table
cleaner is the inevitable conclusion when a deployment starts writing to
`messagecontent` instead, a check is done when the cleaner first starts
up to see if there are any pending payloads at all in the `message`
table.

This allows us to skip repeatedly scanning a table that will never have
new message payloads added to it.
diff --git a/server/svix-server/src/expired_message_cleaner.rs b/server/svix-server/src/expired_message_cleaner.rs
@@ -3,20 +3,54 @@
 
 use std::sync::atomic::Ordering;
 
-use crate::error::Result;
-use sea_orm::{ConnectionTrait, DatabaseConnection, DbErr, Statement, UpdateResult};
-use std::time::Duration;
-use tokio::time::sleep;
+use crate::error::{Error, Result};
+use sea_orm::{
+    ConnectionTrait, DatabaseConnection, DbErr, ExecResult, QueryResult, Statement,
+    TransactionTrait, UpdateResult,
+};
+use std::time::{Duration, Instant};
+
+type DbResult<T> = std::result::Result<T, DbErr>;
+
+async fn exec_without_timeout(pool: &DatabaseConnection, stmt: Statement) -> DbResult<ExecResult> {
+    let increase_timeout = Statement::from_string(
+        pool.get_database_backend(),
+        "SET LOCAL statement_timeout=0;",
+    );
+    let tx = pool.begin().await?;
+    let _ = tx.execute(increase_timeout).await?;
+    let res = tx.execute(stmt).await?;
+    tx.commit().await?;
+    Ok(res)
+}
+async fn query_one_without_timeout(
+    pool: &DatabaseConnection,
+    stmt: Statement,
+) -> DbResult<Option<QueryResult>> {
+    let increase_timeout = Statement::from_string(
+        pool.get_database_backend(),
+        "SET LOCAL statement_timeout=0;",
+    );
+    let tx = pool.begin().await?;
+    let _ = tx.execute(increase_timeout).await?;
+    let res = tx.query_one(stmt).await?;
+    tx.commit().await?;
+    Ok(res)
+}
 
 /// Nullifies the payload column for expired messages,
 /// `limit` sets how many rows to update at a time.
 pub async fn clean_expired_messages(
     pool: &DatabaseConnection,
     limit: u32,
-) -> std::result::Result<UpdateResult, DbErr> {
-    let legacy_stmt = Statement::from_sql_and_values(
-        pool.get_database_backend(),
-        r#"
+    enable_legacy_message_cleaner: bool,
+) -> DbResult<UpdateResult> {
+    // See the docs for [`has_message_payloads_pending_expiry`] for background on the legacy cleaner.
+    let legacy_row_count = if enable_legacy_message_cleaner {
+        let legacy_res = {
+            let legacy_stmt = Statement::from_sql_and_values(
+                pool.get_database_backend(),
+                r#"
         UPDATE message SET payload = NULL WHERE id IN (
             SELECT id FROM message
             WHERE
@@ -26,9 +60,15 @@ pub async fn clean_expired_messages(
             FOR UPDATE SKIP LOCKED
         )
     "#,
-        [limit.into()],
-    );
-    let legacy_res = pool.execute(legacy_stmt).await?;
+                [limit.into()],
+            );
+
+            exec_without_timeout(pool, legacy_stmt).await?
+        };
+        legacy_res.rows_affected()
+    } else {
+        0
+    };
 
     let stmt = Statement::from_sql_and_values(
         pool.get_database_backend(),
@@ -48,32 +88,79 @@ pub async fn clean_expired_messages(
     let res = pool.execute(stmt).await?;
 
     Ok(UpdateResult {
-        rows_affected: legacy_res.rows_affected() + res.rows_affected(),
+        rows_affected: legacy_row_count + res.rows_affected(),
     })
 }
 
+/// Checks to see if the message table has any non-null payloads requiring expiry.
+///
+/// ## Background
+///
+/// Initially payloads were modeled as a field in `message`, but later migrated to a separate
+/// table (`messagecontent`). In cases where there are no longer any payloads to expire in `message` we
+/// can avoid the expense of running the cleaner on the `message` table since all new messages should now be using
+/// `messagecontent`.
+async fn has_message_payloads_pending_expiry(pool: &DatabaseConnection) -> Result<bool> {
+    query_one_without_timeout(
+        pool,
+        Statement::from_string(
+            pool.get_database_backend(),
+            r#"SELECT EXISTS (SELECT 1 FROM message WHERE payload IS NOT NULL LIMIT 1)"#,
+        ),
+    )
+    .await?
+    .ok_or_else(|| Error::generic("failed to check for message payloads"))?
+    .try_get_by_index(0)
+    .map_err(|e| Error::generic(format!("failed to check for message payloads: {e}")))
+}
+
 /// Polls the database for expired messages to nullify payloads for.
 ///
 /// Uses a variable polling schedule, based on affected row counts each iteration of the loop.
 pub async fn expired_message_cleaner_loop(pool: &DatabaseConnection) -> Result<()> {
+    let message_table_needs_cleaning = has_message_payloads_pending_expiry(pool).await?;
+    if !message_table_needs_cleaning {
+        tracing::info!("No payloads pending expiry found in `message` table. Skipping the cleaner for this table.");
+    }
+
     // When no rows have been updated, widen the interval.
-    const IDLE: Duration = Duration::from_secs(10);
+    const IDLE: Duration = Duration::from_secs(60 * 60 * 12);
     // When the affected row count dips below this, switch to the `SLOWING` interval.
-    const SLOWING_THRESHOLD: u64 = 1_000;
-    const SLOWING: Duration = Duration::from_secs(3);
+    const SLOWING_THRESHOLD: u64 = 5_000;
+    const SLOWING: Duration = Duration::from_secs(60 * 60 * 12);
+    const ON_ERROR: Duration = Duration::from_secs(10);
     const BATCH_SIZE: u32 = 5_000;
-    let mut sleep_time = Some(IDLE);
+    let mut sleep_time = None;
     loop {
         if let Some(duration) = sleep_time {
-            sleep(duration).await;
+            let sleep_start = Instant::now();
+            let mut interval = tokio::time::interval(Duration::from_secs(10));
+            interval.tick().await;
+            // Doing a plain sleep() was fine when the polling frequency was mere seconds, but since we're doing wider
+            // periods now (hours, not seconds), we need to be a little more careful about not preventing the process
+            // from shutting down.
+            // Using `interval()` so we can track how long we've been sleeping for, while still checking for the
+            // shutdown signal.
+            'inner: loop {
+                if crate::SHUTTING_DOWN.load(Ordering::SeqCst) {
+                    return Ok(());
+                }
+                interval.tick().await;
+                if sleep_start.elapsed() > duration {
+                    break 'inner;
+                }
+            }
         }
-        match clean_expired_messages(pool, BATCH_SIZE).await {
+
+        let start = Instant::now();
+        match clean_expired_messages(pool, BATCH_SIZE, message_table_needs_cleaning).await {
             Err(err) => {
                 tracing::error!("{}", err);
+                sleep_time = Some(ON_ERROR);
             }
             Ok(UpdateResult { rows_affected }) => {
                 if rows_affected > 0 {
-                    tracing::trace!("expired {} payloads", rows_affected);
+                    tracing::debug!(elapsed =? start.elapsed(), "expired {} payloads", rows_affected);
                 }
 
                 sleep_time = match rows_affected {
diff --git a/server/svix-server/tests/e2e_message.rs b/server/svix-server/tests/e2e_message.rs
@@ -411,7 +411,7 @@ async fn test_payload_retention_period() {
         .unwrap();
     assert_eq!(content.unwrap().id, msg_id.clone());
 
-    expired_message_cleaner::clean_expired_messages(&pool, 5000)
+    expired_message_cleaner::clean_expired_messages(&pool, 5000, false)
         .await
         .unwrap();