Commit 02a5172

chore: improve logging when a table sync worker is in error (#366)
1 parent: defdb0b

3 files changed (+37 lines, -31 lines)

etl/src/replication/common.rs

Lines changed: 3 additions & 7 deletions

@@ -5,22 +5,18 @@ use crate::error::EtlResult;
 use crate::state::table::TableReplicationPhase;
 use crate::store::state::StateStore;
 
-/// Returns the table replication states that are either done or in active state.
+/// Returns the table replication states that are not yet done.
 ///
 /// A table is considered in done state when the apply worker doesn't need to start/restart a table
 /// sync worker to make that table progress.
-pub async fn get_table_replication_states<S>(
+pub async fn get_active_table_replication_states<S>(
     state_store: &S,
-    done: bool,
 ) -> EtlResult<HashMap<TableId, TableReplicationPhase>>
 where
     S: StateStore + Clone + Send + Sync + 'static,
 {
     let mut table_replication_states = state_store.get_table_replication_states().await?;
-    table_replication_states.retain(|_table_id, state| match done {
-        true => state.as_type().is_done(),
-        false => !state.as_type().is_done(),
-    });
+    table_replication_states.retain(|_table_id, state| !state.as_type().is_done());
 
     Ok(table_replication_states)
 }
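
With the `done: bool` flag gone, the filter is part of the function's name rather than an argument at every call site. A minimal sketch of the new call shape, assuming a `store` value implementing `StateStore` (the loop body is illustrative, not from the commit):

    use crate::replication::common::get_active_table_replication_states;

    // Before this commit: get_table_replication_states(&store, false).await?
    let active = get_active_table_replication_states(&store).await?;
    for (table_id, phase) in active {
        // Every entry returned still needs a table sync worker.
        tracing::debug!(
            "table {table_id} is active in phase {}",
            phase.as_type().as_static_str()
        );
    }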

etl/src/state/table.rs

Lines changed: 5 additions & 0 deletions

@@ -320,6 +320,11 @@ impl TableReplicationPhaseType {
         }
     }
 
+    /// Return `true` if a table with this phase is in error, `false` otherwise.
+    pub fn is_errored(&self) -> bool {
+        matches!(self, Self::Errored)
+    }
+
     pub fn as_static_str(&self) -> &'static str {
         match self {
             TableReplicationPhaseType::Init => "init",
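
`is_errored` mirrors the crate's existing `is_done` predicate. Note that the apply-worker change below checks `is_errored` only in the branch where `is_done` is already true, which suggests `Errored` counts as a done phase (done in the sense that no worker should be started, not that the table synced successfully). A hedged sketch of the three-way classification this enables, with `phase`, `table_id`, and the log text as illustrative stand-ins:

    // `phase: TableReplicationPhaseType` for some table `table_id`.
    if !phase.is_done() {
        // Still in progress: a table sync worker should be running.
    } else if phase.is_errored() {
        // Done only because replication failed: worth a loud log line.
        tracing::warn!("table {table_id} is errored and will not be synced");
    } else {
        // Genuinely finished (e.g. `SyncDone` or `Ready`): nothing to do.
    }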

etl/src/workers/apply.rs

Lines changed: 29 additions & 24 deletions

@@ -4,6 +4,7 @@ use std::sync::Arc;
 use tokio::sync::Semaphore;
 use tokio::task::JoinHandle;
 use tokio_postgres::types::PgLsn;
+use tracing::warn;
 use tracing::{Instrument, debug, error, info};
 
 use crate::concurrency::shutdown::ShutdownRx;
@@ -14,7 +15,7 @@ use crate::error::{ErrorKind, EtlError, EtlResult};
 use crate::etl_error;
 use crate::replication::apply::{ApplyLoopAction, ApplyLoopHook, start_apply_loop};
 use crate::replication::client::PgReplicationClient;
-use crate::replication::common::get_table_replication_states;
+use crate::replication::common::get_active_table_replication_states;
 use crate::state::table::{
     TableReplicationError, TableReplicationPhase, TableReplicationPhaseType,
 };
@@ -387,30 +388,34 @@ where
     async fn before_loop(&self, _start_lsn: PgLsn) -> EtlResult<ApplyLoopAction> {
         info!("starting table sync workers before the main apply loop");
 
-        let active_table_replication_states =
-            get_table_replication_states(&self.store, false).await?;
-
-        for (table_id, table_replication_phase) in active_table_replication_states {
-            // A table in `SyncDone` doesn't need to have its worker started, since the main apply
-            // worker will move it into `Ready` state automatically once the condition is met.
-            if let TableReplicationPhaseType::SyncDone = table_replication_phase.as_type() {
-                continue;
-            }
+        for (table_id, table_replication_phase) in self.store.get_table_replication_states().await?
+        {
+            if !table_replication_phase.as_type().is_done() {
+                // A table in `SyncDone` doesn't need to have its worker started, since the main apply
+                // worker will move it into `Ready` state automatically once the condition is met.
+                if let TableReplicationPhaseType::SyncDone = table_replication_phase.as_type() {
+                    continue;
+                }
 
-            // If there is already an active worker for this table in the pool, we can avoid starting
-            // it.
-            let mut pool = self.pool.lock().await;
-            if pool.get_active_worker_state(table_id).is_some() {
-                continue;
-            }
+                // If there is already an active worker for this table in the pool, we can avoid starting
+                // it.
+                let mut pool = self.pool.lock().await;
+                if pool.get_active_worker_state(table_id).is_some() {
+                    continue;
+                }
 
-            // If we fail, we just show an error, and hopefully we will succeed when starting it
-            // during syncing tables.
-            let table_sync_worker = self.build_table_sync_worker(table_id).await;
-            if let Err(err) = pool.start_worker(table_sync_worker).await {
-                error!(
-                    "error starting table sync worker for table {} during initialization: {}",
-                    table_id, err
+                // If we fail, we just show an error, and hopefully we will succeed when starting it
+                // during syncing tables.
+                let table_sync_worker = self.build_table_sync_worker(table_id).await;
+                if let Err(err) = pool.start_worker(table_sync_worker).await {
+                    error!(
+                        "error starting table sync worker for table {} during initialization: {}",
+                        table_id, err
+                    );
+                }
+            } else if table_replication_phase.as_type().is_errored() {
+                warn!(
+                    "table sync worker for table {table_id} won't run because it is in an errored state."
                 );
             }
         }
@@ -429,7 +434,7 @@
         update_state: bool,
     ) -> EtlResult<ApplyLoopAction> {
         let active_table_replication_states =
-            get_table_replication_states(&self.store, false).await?;
+            get_active_table_replication_states(&self.store).await?;
         debug!(
             "processing syncing tables for apply worker with lsn {}",
            current_lsn
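
Distilled, `before_loop` now iterates the raw state map and classifies each phase in one pass, instead of pre-filtering to active tables; that restructuring is what gives the errored case a place to be logged. A simplified schematic of the control flow above (comments stand in for the elided worker-pool logic):

    for (table_id, phase) in self.store.get_table_replication_states().await? {
        if !phase.as_type().is_done() {
            // Active table: start its sync worker unless it is `SyncDone`
            // or a worker is already in the pool; log an error!() on failure.
        } else if phase.as_type().is_errored() {
            // New in this commit: errored tables are surfaced instead of
            // being silently skipped by the old pre-filtering.
            warn!(
                "table sync worker for table {table_id} won't run because it is in an errored state."
            );
        }
        // Other done phases (e.g. `Ready`) still require no action here.
    }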
